Data analysis for:

Dylan J. Padilla Perez, 2023. Geographic variation of the for gene reveals signatures of local adaptation in Drosophila melanogaster.

Dylan J. Padilla Perez, School of Life Sciences, Arizona State University, Tempe, AZ 85287, USA.

Library

library(adegenet)
library(ade4)
library(car)
library(ComplexHeatmap)
library(data.table)
library(dartR)
library(dplyr)
library(ecodist)
library(ggsn)
library(grid)
library(gridBase)
library(ggplot2)
library(gplots)
library(ggpubr)
library(ggspatial)
library(hierfstat)
library(lattice)
library(LEA)
library(lfmm)
library(MASS)
library(maps)
library(maptools)
library(mapplots)
library(miscTools)
library(naniar)
library(OutFLANK)
library(patchwork)
library(pegas)
library(poppr)
library(qvalue)
library(randomcoloR)
library(raster)
library(reshape)
library(rgeos)
library(rnaturalearth)
library(rnaturalearthdata)
library(rworldxtra)
library(rworldmap)
library(scales)
library(sf)
library(shape)
library(SeqArray)
library(SeqVarTools)
library(SNPRelate)
library(stringr)
library(vcfR)
library(vegan)

Session information

R.version
                 _                           
  platform       x86_64-apple-darwin17.0     
  arch           x86_64                      
  os             darwin17.0                  
  system         x86_64, darwin17.0          
  status                                     
  major          4                           
  minor          1.1                         
  year           2021                        
  month          08                          
  day            10                          
  svn rev        80725                       
  language       R                           
  version.string R version 4.1.1 (2021-08-10)
  nickname       Kick Things
sessionInfo()
  R version 4.1.1 (2021-08-10)
  Platform: x86_64-apple-darwin17.0 (64-bit)
  Running under: macOS Big Sur 10.16
  
  Matrix products: default
  BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
  LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
  
  locale:
  [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
  
  attached base packages:
  [1] grid      stats     graphics  grDevices utils     datasets  methods  
  [8] base     
  
  other attached packages:
   [1] vegan_2.6-4             permute_0.9-7           vcfR_1.14.0            
   [4] stringr_1.5.0           SNPRelate_1.28.0        SeqVarTools_1.32.0     
   [7] SeqArray_1.34.0         gdsfmt_1.30.0           shape_1.4.6            
  [10] sf_1.0-9                scales_1.2.1            rworldmap_1.3-6        
  [13] rworldxtra_1.01         rnaturalearthdata_0.1.0 rnaturalearth_0.3.2    
  [16] rgeos_0.6-1             reshape_0.8.9           raster_3.6-14          
  [19] randomcoloR_1.1.0.1     poppr_2.9.3             pegas_1.1              
  [22] ape_5.7                 patchwork_1.1.2         OutFLANK_0.2           
  [25] qvalue_2.26.0           naniar_1.0.0            miscTools_0.6-26       
  [28] mapplots_1.5.1          maptools_1.1-6          sp_1.6-0               
  [31] maps_3.4.1              MASS_7.3-58.2           lfmm_1.1               
  [34] LEA_3.6.0               lattice_0.20-45         hierfstat_0.5-11       
  [37] ggspatial_1.1.7         ggpubr_0.6.0            gplots_3.1.3           
  [40] gridBase_0.4-7          ggsn_0.5.0              ecodist_2.0.9          
  [43] dartR_2.7.2             dartR.data_1.0.2        dplyr_1.1.0            
  [46] ggplot2_3.4.1           data.table_1.14.6       ComplexHeatmap_2.10.0  
  [49] car_3.1-1               carData_3.0-5           adegenet_2.1.10        
  [52] ade4_1.7-22             rmarkdown_2.20         
  
  loaded via a namespace (and not attached):
    [1] utf8_1.2.3             R.utils_2.12.2         tidyselect_1.2.0      
    [4] combinat_0.0-8         Rtsne_0.16             StAMPP_1.6.3          
    [7] GWASExactHW_1.01       munsell_0.5.0          codetools_0.2-19      
   [10] units_0.8-1            withr_2.5.0            colorspace_2.1-0      
   [13] progressr_0.13.0       Biobase_2.54.0         highr_0.10            
   [16] knitr_1.42             stats4_4.1.1           ggsignif_0.6.4        
   [19] labeling_0.4.2         RgoogleMaps_1.4.5.3    logistf_1.24.1        
   [22] GenomeInfoDbData_1.2.7 farver_2.1.1           gap.datasets_0.0.5    
   [25] vctrs_0.5.2            generics_0.1.3         xfun_0.37             
   [28] GenomeInfoDb_1.30.1    R6_2.5.1               doParallel_1.0.17     
   [31] clue_0.3-64            fields_14.1            bitops_1.0-7          
   [34] cachem_1.0.6           promises_1.2.0.1       pinfsc50_1.2.0        
   [37] gtable_0.3.1           formula.tools_1.7.1    spam_2.9-1            
   [40] rlang_1.0.6            calibrate_1.7.7        GlobalOptions_0.1.2   
   [43] splines_4.1.1          rstatix_0.7.2          rgdal_1.6-4           
   [46] broom_1.0.3            yaml_2.3.7             reshape2_1.4.4        
   [49] abind_1.4-5            backports_1.4.1        httpuv_1.6.9          
   [52] tools_4.1.1            ellipsis_0.3.2         jquerylib_0.1.4       
   [55] RColorBrewer_1.1-3     proxy_0.4-27           BiocGenerics_0.40.0   
   [58] Rcpp_1.0.10            plyr_1.8.8             zlibbioc_1.40.0       
   [61] RCurl_1.98-1.10        classInt_0.4-8         purrr_1.0.1           
   [64] GetoptLong_1.0.5       viridis_0.6.2          S4Vectors_0.32.4      
   [67] ggmap_3.0.1            cluster_2.1.4          magrittr_2.0.3        
   [70] RSpectra_0.16-1        genetics_1.3.8.1.3     circlize_0.4.15       
   [73] mvtnorm_1.1-3          matrixStats_0.63.0     mime_0.12             
   [76] evaluate_0.20          xtable_1.8-4           jpeg_0.1-10           
   [79] IRanges_2.28.0         gridExtra_2.3          compiler_4.1.1        
   [82] mice_3.15.0            tibble_3.1.8           KernSmooth_2.23-20    
   [85] V8_4.2.2               crayon_1.5.2           gdistance_1.6         
   [88] R.oo_1.25.0            htmltools_0.5.4        mgcv_1.8-41           
   [91] later_1.3.0            tidyr_1.3.0            visdat_0.6.0          
   [94] DBI_1.1.3              PopGenReport_3.0.7     boot_1.3-28.1         
   [97] Matrix_1.5-1           cli_3.6.0              R.methodsS3_1.8.2     
  [100] gdata_2.18.0.1         parallel_4.1.1         dotCall64_1.0-2       
  [103] igraph_1.4.0           GenomicRanges_1.46.1   pkgconfig_2.0.3       
  [106] foreign_0.8-84         terra_1.7-3            foreach_1.5.2         
  [109] memuse_4.2-3           bslib_0.4.2            XVector_0.34.0        
  [112] digest_0.6.31          polysat_1.7-7          Biostrings_2.62.0     
  [115] operator.tools_1.6.3   curl_5.0.0             gap_1.5-1             
  [118] shiny_1.7.4            gtools_3.9.4           rjson_0.2.21          
  [121] lifecycle_1.0.3        nlme_3.1-162           dismo_1.3-9           
  [124] jsonlite_1.8.4         seqinr_4.2-23          viridisLite_0.4.1     
  [127] fansi_1.0.4            pillar_1.8.1           GGally_2.1.2          
  [130] fastmap_1.1.0          httr_1.4.4             glue_1.6.2            
  [133] mmod_1.3.3             png_0.1-8              iterators_1.0.14      
  [136] class_7.3-21           stringi_1.7.12         sass_0.4.5            
  [139] caTools_1.18.2         e1071_1.7-13



Quality control and SNP isolation

## Setting a seed

set.seed(94)


## load meta-data file

samps <- fread("samps_10Nov2020.csv")

## open GDS for common SNPs (PoolSNP)

genofile <- seqOpen("dest.all.PoolSNP.001.50.10Nov2020.ann.gds", allow.duplicate = TRUE)

## common SNP.dt

snp.dt <- data.table(chr = seqGetData(genofile, "chromosome"),
                     pos = seqGetData(genofile, "position"),
                     nAlleles = seqGetData(genofile, "$num_allele"),
                     id = seqGetData(genofile, "variant.id"),
                     genotype = seqGetData(genofile, "allele"))

snp.dt <- snp.dt[nAlleles == 2]
seqSetFilter(genofile, snp.dt$id)
## # of selected variants: 4,281,164
## filter to target

snp.tmp <- data.table(chr ="2L", pos = 3622086:3656954)
setkey(snp.tmp, chr, pos)
setkey(snp.dt, chr, pos)
seqSetFilter(genofile, variant.id=snp.dt[J(snp.tmp), nomatch = 0]$id)
## # of selected variants: 1,613
## get annotations

message("Annotations")
tmp <- seqGetData(genofile, "annotation/info/ANN")
len1 <- tmp$length
len2 <- tmp$data

snp.dt1 <- data.table(len = rep(len1, times = len1), ann = len2, id = rep(snp.dt[J(snp.tmp), nomatch = 0]$id, times = len1))

## Extract data between the 2nd and third | symbol

snp.dt1[ ,class := tstrsplit(snp.dt1$ann,"\\|")[[2]]]
snp.dt1[ ,gene := tstrsplit(snp.dt1$ann,"\\|")[[4]]]

## Collapse additional annotations to original SNP vector length

snp.dt1.an <- snp.dt1[,list(n = length(class), col = paste(class, collapse = ","), gene = paste(gene, collapse = ",")), list(variant.id = id)]

snp.dt1.an[,col := tstrsplit(snp.dt1.an$col,"\\,")[[1]]]
snp.dt1.an[,gene := tstrsplit(snp.dt1.an$gene,"\\,")[[1]]]

## get frequencies

message("Allele Freqs")

ad <- seqGetData(genofile, "annotation/format/AD")
dp <- seqGetData(genofile, "annotation/format/DP")

af <- data.table(ad = expand.grid(ad$data)[,1],
                 dp = expand.grid(dp)[,1],
                 sampleId = rep(seqGetData(genofile, "sample.id"), dim(ad$data)[2]),
                 variant.id = rep(seqGetData(genofile, "variant.id"), each = dim(ad$data)[1]))

## merge them together

message("merge")
afi <- merge(af, snp.dt1.an, by = "variant.id")
afi <- merge(afi, snp.dt, by.x = "variant.id", by.y = "id")

afi[ , af := ad/dp]

## calculate effective read-depth

afis <- merge(afi, samps, by = "sampleId")

afis[chr == "X", nEff := round((dp*nFlies - 1)/(dp+nFlies))]
afis[chr != "X", nEff := round((dp*2*nFlies - 1)/(dp+2*nFlies))]
afis[ ,af_nEff := round(af*nEff)/nEff]


## subsetting dataset

names(afis)
##   [1] "sampleId"       "variant.id"     "ad"             "dp"            
##   [5] "n"              "col"            "gene"           "chr"           
##   [9] "pos"            "nAlleles"       "genotype"       "af"            
##  [13] "locality"       "country"        "city"           "collectionDate"
##  [17] "DateExact"      "lat"            "long"           "season"        
##  [21] "type"           "continent"      "set"            "nFlies"        
##  [25] "SRA_accession"  "SRA_experiment" "Model"          "collector"     
##  [29] "sampleType"     "year"           "yday"           "stationId"     
##  [33] "dist_km"        "In(2L)t"        "In(2R)Ns"       "In(3L)P"       
##  [37] "In(3R)C"        "In(3R)K"        "In(3R)Mo"       "In(3R)Payne"   
##  [41] "status"         "bio1"           "bio2"           "bio3"          
##  [45] "bio4"           "bio5"           "bio6"           "bio7"          
##  [49] "bio8"           "bio9"           "bio10"          "bio11"         
##  [53] "bio12"          "bio13"          "bio14"          "bio15"         
##  [57] "bio16"          "bio17"          "bio18"          "bio19"         
##  [61] "tmin1"          "tmin2"          "tmin3"          "tmin4"         
##  [65] "tmin5"          "tmin6"          "tmin7"          "tmin8"         
##  [69] "tmin9"          "tmin10"         "tmin11"         "tmin12"        
##  [73] "tmax1"          "tmax2"          "tmax3"          "tmax4"         
##  [77] "tmax5"          "tmax6"          "tmax7"          "tmax8"         
##  [81] "tmax9"          "tmax10"         "tmax11"         "tmax12"        
##  [85] "prec1"          "prec2"          "prec3"          "prec4"         
##  [89] "prec5"          "prec6"          "prec7"          "prec8"         
##  [93] "prec9"          "prec10"         "prec11"         "prec12"        
##  [97] "propSimNorm"    "sex"            "nEff"           "af_nEff"
season.dat <- as.data.frame(afis[ , c(21, 30, 1, 13, 14, 15, 19, 18, 20, 2, 11, 12, 6, 22, 99)])
str(season.dat)
## 'data.frame':    438736 obs. of  15 variables:
##  $ type      : chr  "pooled" "pooled" "pooled" "pooled" ...
##  $ year      : int  2014 2014 2014 2014 2014 2014 2014 2014 2014 2014 ...
##  $ sampleId  : chr  "AT_Mau_14_01" "AT_Mau_14_01" "AT_Mau_14_01" "AT_Mau_14_01" ...
##  $ locality  : chr  "AT_Mau" "AT_Mau" "AT_Mau" "AT_Mau" ...
##  $ country   : chr  "Austria" "Austria" "Austria" "Austria" ...
##  $ city      : chr  "Mauternbach" "Mauternbach" "Mauternbach" "Mauternbach" ...
##  $ long      : num  15.6 15.6 15.6 15.6 15.6 ...
##  $ lat       : num  48.4 48.4 48.4 48.4 48.4 ...
##  $ season    : chr  "spring" "spring" "spring" "spring" ...
##  $ variant.id: int  162333 162334 162335 162336 162337 162338 162339 162340 162341 162342 ...
##  $ genotype  : chr  "A,C" "A,C" "C,T" "T,C" ...
##  $ af        : num  1 0 0.1569 0.0196 NA ...
##  $ col       : chr  "3_prime_UTR_variant" "3_prime_UTR_variant" "3_prime_UTR_variant" "3_prime_UTR_variant" ...
##  $ continent : chr  "Europe" "Europe" "Europe" "Europe" ...
##  $ nEff      : num  28 26 31 31 NA 36 36 33 24 30 ...
head(season.dat)
##     type year     sampleId locality country        city  long    lat season
## 1 pooled 2014 AT_Mau_14_01   AT_Mau Austria Mauternbach 15.56 48.375 spring
## 2 pooled 2014 AT_Mau_14_01   AT_Mau Austria Mauternbach 15.56 48.375 spring
## 3 pooled 2014 AT_Mau_14_01   AT_Mau Austria Mauternbach 15.56 48.375 spring
## 4 pooled 2014 AT_Mau_14_01   AT_Mau Austria Mauternbach 15.56 48.375 spring
## 5 pooled 2014 AT_Mau_14_01   AT_Mau Austria Mauternbach 15.56 48.375 spring
## 6 pooled 2014 AT_Mau_14_01   AT_Mau Austria Mauternbach 15.56 48.375 spring
##   variant.id genotype         af                 col continent nEff
## 1     162333      A,C 1.00000000 3_prime_UTR_variant    Europe   28
## 2     162334      A,C 0.00000000 3_prime_UTR_variant    Europe   26
## 3     162335      C,T 0.15686275 3_prime_UTR_variant    Europe   31
## 4     162336      T,C 0.01960784 3_prime_UTR_variant    Europe   31
## 5     162337      C,T         NA 3_prime_UTR_variant    Europe   NA
## 6     162338      C,T 0.00000000 3_prime_UTR_variant    Europe   36
dim(season.dat)
## [1] 438736     15
season.dat$season[season.dat$season == "frost"] <- "fall"

season.dat$season <- as.factor(season.dat$season)
season.filter.dat <- data.frame()

localities <- levels(as.factor(season.dat$locality))
seasons <- c("fall", "spring")

## getting samples collected at least once during spring and winter

for(loc in localities){
    dft <- season.dat[season.dat$locality == loc, ]
    if(all(seasons %in% unique(dft$season))){
        season.filter.dat <- rbind(season.filter.dat, dft)
    }
}

dim(season.filter.dat)
## [1] 285501     15
dim(na.omit(season.filter.dat))
## [1] 269454     15
str(season.filter.dat)
## 'data.frame':    285501 obs. of  15 variables:
##  $ type      : chr  "pooled" "pooled" "pooled" "pooled" ...
##  $ year      : int  2012 2012 2012 2012 2012 2012 2012 2012 2012 2012 ...
##  $ sampleId  : chr  "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" ...
##  $ locality  : chr  "AT_gr" "AT_gr" "AT_gr" "AT_gr" ...
##  $ country   : chr  "Austria" "Austria" "Austria" "Austria" ...
##  $ city      : chr  "Gross-Enzersdorf" "Gross-Enzersdorf" "Gross-Enzersdorf" "Gross-Enzersdorf" ...
##  $ long      : num  16.4 16.4 16.4 16.4 16.4 ...
##  $ lat       : num  48.2 48.2 48.2 48.2 48.2 48.2 48.2 48.2 48.2 48.2 ...
##  $ season    : Factor w/ 2 levels "fall","spring": 1 1 1 1 1 1 1 1 1 1 ...
##  $ variant.id: int  162333 162334 162335 162336 162337 162338 162339 162340 162341 162342 ...
##  $ genotype  : chr  "A,C" "A,C" "C,T" "T,C" ...
##  $ af        : num  0.9773 NA 0.2273 0 0.0244 ...
##  $ col       : chr  "3_prime_UTR_variant" "3_prime_UTR_variant" "3_prime_UTR_variant" "3_prime_UTR_variant" ...
##  $ continent : chr  "Europe" "Europe" "Europe" "Europe" ...
##  $ nEff      : num  29 NA 29 33 28 28 27 25 27 25 ...
mat1 <- season.filter.dat[ , c(3, 6, 5, 9, 10, 14, 12, 15)]
names(mat1)
## [1] "sampleId"   "city"       "country"    "season"     "variant.id"
## [6] "continent"  "af"         "nEff"
str(mat1)
## 'data.frame':    285501 obs. of  8 variables:
##  $ sampleId  : chr  "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" ...
##  $ city      : chr  "Gross-Enzersdorf" "Gross-Enzersdorf" "Gross-Enzersdorf" "Gross-Enzersdorf" ...
##  $ country   : chr  "Austria" "Austria" "Austria" "Austria" ...
##  $ season    : Factor w/ 2 levels "fall","spring": 1 1 1 1 1 1 1 1 1 1 ...
##  $ variant.id: int  162333 162334 162335 162336 162337 162338 162339 162340 162341 162342 ...
##  $ continent : chr  "Europe" "Europe" "Europe" "Europe" ...
##  $ af        : num  0.9773 NA 0.2273 0 0.0244 ...
##  $ nEff      : num  29 NA 29 33 28 28 27 25 27 25 ...
dim(mat1)
## [1] 285501      8
mat1 <- mat1[!is.na(mat1$nEff), ]
dim(mat1)
## [1] 269454      8
mat1 <- mat1[mat1$nEff >= 28, ] ## Applying a Neff filter of 28
dim(mat1)
## [1] 212017      8
mat1 <- mat1[ , -8]
str(mat1)
## 'data.frame':    212017 obs. of  7 variables:
##  $ sampleId  : chr  "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" ...
##  $ city      : chr  "Gross-Enzersdorf" "Gross-Enzersdorf" "Gross-Enzersdorf" "Gross-Enzersdorf" ...
##  $ country   : chr  "Austria" "Austria" "Austria" "Austria" ...
##  $ season    : Factor w/ 2 levels "fall","spring": 1 1 1 1 1 1 1 1 1 1 ...
##  $ variant.id: int  162333 162335 162336 162337 162338 162350 162351 162352 162353 162357 ...
##  $ continent : chr  "Europe" "Europe" "Europe" "Europe" ...
##  $ af        : num  0.9773 0.2273 0 0.0244 0 ...
## Changing name of cities

mat1$city <- as.factor(mat1$city)
levels(mat1$city)
##  [1] "Akaa"                        "Athens"                     
##  [3] "Barcelona"                   "Benton Harbor"              
##  [5] "Broggingen"                  "Brzezina"                   
##  [7] "Chalet \xe0 Gobet"           "Charlottesville"            
##  [9] "Charlotttesville"            "Chornobyl (Cooling pond)"   
## [11] "Chornobyl Applegarden"       "Chornobyl Applegarden (New)"
## [13] "Chornobyl Kopachi"           "Chornobyl Polisske"         
## [15] "Chornobyl Yaniv"             "Cross Plains"               
## [17] "Drogobych"                   "Esparto"                    
## [19] "Gimenells"                   "Gotheron"                   
## [21] "Gross-Enzersdorf"            "Homestead"                  
## [23] "Ithaca"                      "Kalna"                      
## [25] "Karensminde"                 "Karensminde Orchard"        
## [27] "Kharkiv"                     "Kyiv"                       
## [29] "Kyiv (Vyshneve)"             "Lancaster"                  
## [31] "Linvilla"                    "Market Harborough"          
## [33] "Mauternbach"                 "Munich"                     
## [35] "Odesa"                       "Odessa"                     
## [37] "Sheffield"                   "Slankamen. Vinogradi"       
## [39] "State College"               "Sudbury"                    
## [41] "Topeka"                      "Tuolumne"                   
## [43] "valday"                      "Valday"                     
## [45] "Viltain"                     "Yesiloz"                    
## [47] "Yesiloz TR13"                "Yesiloz TR14"               
## [49] "Yesiloz TR15"                "Yesiloz TR16"               
## [51] "Yesiloz TR17"                "Yesiloz TR18"
levels(mat1$city) <- gsub("valday", "Valday", levels(mat1$city))
levels(mat1$city) <- gsub("Odesa", "Odessa", levels(mat1$city))
levels(mat1$city) <- gsub("Charlotttesville", "Charlottesville", levels(mat1$city))
levels(mat1$city) <- gsub("Yesiloz TR13", "Yesiloz", levels(mat1$city))
levels(mat1$city) <- gsub("Yesiloz TR14", "Yesiloz", levels(mat1$city))
levels(mat1$city) <- gsub("Yesiloz TR15", "Yesiloz", levels(mat1$city))
levels(mat1$city) <- gsub("Yesiloz TR16", "Yesiloz", levels(mat1$city))
levels(mat1$city) <- gsub("Yesiloz TR17", "Yesiloz", levels(mat1$city))
levels(mat1$city) <- gsub("Yesiloz TR18", "Yesiloz", levels(mat1$city))
levels(mat1$city) <- gsub("Karensminde Orchard", "Karensminde", levels(mat1$city))
levels(mat1$city) <- gsub("Slankamen. Vinogradi", "Slankamen-Vinogradi", levels(mat1$city))
levels(mat1$city) <- gsub("Chalet \xe0 Gobet", "Chalet-A-Gobet", levels(mat1$city))

levels(mat1$city)
##  [1] "Akaa"                        "Athens"                     
##  [3] "Barcelona"                   "Benton Harbor"              
##  [5] "Broggingen"                  "Brzezina"                   
##  [7] "Chalet-A-Gobet"              "Charlottesville"            
##  [9] "Chornobyl (Cooling pond)"    "Chornobyl Applegarden"      
## [11] "Chornobyl Applegarden (New)" "Chornobyl Kopachi"          
## [13] "Chornobyl Polisske"          "Chornobyl Yaniv"            
## [15] "Cross Plains"                "Drogobych"                  
## [17] "Esparto"                     "Gimenells"                  
## [19] "Gotheron"                    "Gross-Enzersdorf"           
## [21] "Homestead"                   "Ithaca"                     
## [23] "Kalna"                       "Karensminde"                
## [25] "Kharkiv"                     "Kyiv"                       
## [27] "Kyiv (Vyshneve)"             "Lancaster"                  
## [29] "Linvilla"                    "Market Harborough"          
## [31] "Mauternbach"                 "Munich"                     
## [33] "Odessa"                      "Sheffield"                  
## [35] "Slankamen-Vinogradi"         "State College"              
## [37] "Sudbury"                     "Topeka"                     
## [39] "Tuolumne"                    "Valday"                     
## [41] "Viltain"                     "Yesiloz"
levels(mat1$city)[c(9, 10, 11, 12, 13, 14)] <- "Chornobyl"
levels(mat1$city)
##  [1] "Akaa"                "Athens"              "Barcelona"          
##  [4] "Benton Harbor"       "Broggingen"          "Brzezina"           
##  [7] "Chalet-A-Gobet"      "Charlottesville"     "Chornobyl"          
## [10] "Cross Plains"        "Drogobych"           "Esparto"            
## [13] "Gimenells"           "Gotheron"            "Gross-Enzersdorf"   
## [16] "Homestead"           "Ithaca"              "Kalna"              
## [19] "Karensminde"         "Kharkiv"             "Kyiv"               
## [22] "Kyiv (Vyshneve)"     "Lancaster"           "Linvilla"           
## [25] "Market Harborough"   "Mauternbach"         "Munich"             
## [28] "Odessa"              "Sheffield"           "Slankamen-Vinogradi"
## [31] "State College"       "Sudbury"             "Topeka"             
## [34] "Tuolumne"            "Valday"              "Viltain"            
## [37] "Yesiloz"
levels(mat1$city)[22] <- "Kyiv"
levels(mat1$city)
##  [1] "Akaa"                "Athens"              "Barcelona"          
##  [4] "Benton Harbor"       "Broggingen"          "Brzezina"           
##  [7] "Chalet-A-Gobet"      "Charlottesville"     "Chornobyl"          
## [10] "Cross Plains"        "Drogobych"           "Esparto"            
## [13] "Gimenells"           "Gotheron"            "Gross-Enzersdorf"   
## [16] "Homestead"           "Ithaca"              "Kalna"              
## [19] "Karensminde"         "Kharkiv"             "Kyiv"               
## [22] "Lancaster"           "Linvilla"            "Market Harborough"  
## [25] "Mauternbach"         "Munich"              "Odessa"             
## [28] "Sheffield"           "Slankamen-Vinogradi" "State College"      
## [31] "Sudbury"             "Topeka"              "Tuolumne"           
## [34] "Valday"              "Viltain"             "Yesiloz"
levels(mat1$city)[29] <- "Slankamenacki-Vinogradi"
levels(mat1$city)
##  [1] "Akaa"                    "Athens"                 
##  [3] "Barcelona"               "Benton Harbor"          
##  [5] "Broggingen"              "Brzezina"               
##  [7] "Chalet-A-Gobet"          "Charlottesville"        
##  [9] "Chornobyl"               "Cross Plains"           
## [11] "Drogobych"               "Esparto"                
## [13] "Gimenells"               "Gotheron"               
## [15] "Gross-Enzersdorf"        "Homestead"              
## [17] "Ithaca"                  "Kalna"                  
## [19] "Karensminde"             "Kharkiv"                
## [21] "Kyiv"                    "Lancaster"              
## [23] "Linvilla"                "Market Harborough"      
## [25] "Mauternbach"             "Munich"                 
## [27] "Odessa"                  "Sheffield"              
## [29] "Slankamenacki-Vinogradi" "State College"          
## [31] "Sudbury"                 "Topeka"                 
## [33] "Tuolumne"                "Valday"                 
## [35] "Viltain"                 "Yesiloz"
dim(mat1)
## [1] 212017      7
str(mat1)
## 'data.frame':    212017 obs. of  7 variables:
##  $ sampleId  : chr  "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" ...
##  $ city      : Factor w/ 36 levels "Akaa","Athens",..: 15 15 15 15 15 15 15 15 15 15 ...
##  $ country   : chr  "Austria" "Austria" "Austria" "Austria" ...
##  $ season    : Factor w/ 2 levels "fall","spring": 1 1 1 1 1 1 1 1 1 1 ...
##  $ variant.id: int  162333 162335 162336 162337 162338 162350 162351 162352 162353 162357 ...
##  $ continent : chr  "Europe" "Europe" "Europe" "Europe" ...
##  $ af        : num  0.9773 0.2273 0 0.0244 0 ...
mat1 <- mat1[!mat1$country == "United Kingdom", ] ## The united kingdom only has 4 pooled samples, so I removed it from the analyses

mat2 <- reshape(mat1, idvar = c("sampleId", "city", "country", "season", "continent"), timevar = "variant.id", direction = "wide")
mat2[1:6, 1:10]
##              sampleId             city country season continent af.162333
## 9679    AT_gr_12_fall Gross-Enzersdorf Austria   fall    Europe 0.9772727
## 11292 AT_gr_12_spring Gross-Enzersdorf Austria spring    Europe 0.8888889
## 1        AT_Mau_14_01      Mauternbach Austria spring    Europe 1.0000000
## 1617     AT_Mau_14_02      Mauternbach Austria   fall    Europe        NA
## 3227     AT_Mau_15_50      Mauternbach Austria spring    Europe 0.7714286
## 4840     AT_Mau_15_51      Mauternbach Austria   fall    Europe 0.6984127
##       af.162335  af.162336  af.162337 af.162338
## 9679  0.2272727 0.00000000 0.02439024         0
## 11292        NA 0.00000000         NA        NA
## 1     0.1568627 0.01960784         NA         0
## 1617         NA 0.00000000 0.00000000         0
## 3227  0.1323529 0.00000000 0.00000000         0
## 4840  0.2656250 0.01282051 0.00000000         0
dim(mat2)
## [1]  170 1614
mat2$city <- as.character(mat2$city)
mat2$season <- as.character(mat2$season)

mat2 <- mat2[mat2$city != "Homestead", ]
mat2 <- mat2[mat2$city != "Athens", ]
mat2 <- mat2[mat2$city != "Sudbury", ]
mat2 <- mat2[mat2$city != "Brzezina", ]
mat2 <- mat2[mat2$city != "Kalna", ]
mat2 <- mat2[mat2$city != "Slankamenacki-Vinogradi", ]
mat2 <- mat2[mat2$city != "Topeka", ]
mat2 <- mat2[mat2$city != "Chalet-A-Gobet", ]

usa <- c("Lancaster", "Ithaca", "Cross Plains", "Benton Harbor", "Linvilla", "State College", "Tuolumne", "Esparto", "Charlottesville")

for(i in usa){
    mat2$country[mat2$city == i] <- i
}

mat2$country <- as.factor(mat2$country)

levels(mat2$country)
##  [1] "Austria"         "Benton Harbor"   "Charlottesville" "Cross Plains"   
##  [5] "Denmark"         "Esparto"         "Finland"         "France"         
##  [9] "Germany"         "Ithaca"          "Lancaster"       "Linvilla"       
## [13] "Russia"          "Spain"           "State College"   "Tuolumne"       
## [17] "Turkey"          "Ukraine"
levels(mat2$country)[c(2, 4)] <- "MI-WI"
levels(mat2$country)
##  [1] "Austria"         "MI-WI"           "Charlottesville" "Denmark"        
##  [5] "Esparto"         "Finland"         "France"          "Germany"        
##  [9] "Ithaca"          "Lancaster"       "Linvilla"        "Russia"         
## [13] "Spain"           "State College"   "Tuolumne"        "Turkey"         
## [17] "Ukraine"
levels(mat2$country)[c(9, 10)] <- "NY-MA"
levels(mat2$country)
##  [1] "Austria"         "MI-WI"           "Charlottesville" "Denmark"        
##  [5] "Esparto"         "Finland"         "France"          "Germany"        
##  [9] "NY-MA"           "Linvilla"        "Russia"          "Spain"          
## [13] "State College"   "Tuolumne"        "Turkey"          "Ukraine"
levels(mat2$country)[c(10, 13)] <- "PA"
levels(mat2$country)
##  [1] "Austria"         "MI-WI"           "Charlottesville" "Denmark"        
##  [5] "Esparto"         "Finland"         "France"          "Germany"        
##  [9] "NY-MA"           "PA"              "Russia"          "Spain"          
## [13] "Tuolumne"        "Turkey"          "Ukraine"
mat3 <- as.data.frame(getGenotypeAlleles(genofile)) ## extracting genotypes from the GDS file
mat3[1:5, 1:5]
##              162333 162334 162335 162336 162337
## AT_Mau_14_01    C/C    A/A    C/T    T/C   <NA>
## AT_Mau_14_02    A/C    A/A    C/T    T/T    C/C
## AT_Mau_15_50    A/C    A/A    C/T    T/T    C/C
## AT_Mau_15_51    A/C   <NA>    C/T    T/C    C/C
## AT_See_14_44    A/C    A/C    C/T    T/T    C/C
for(i in 1:ncol(mat3)){
    mat3[ , i] <- gsub("/", "", mat3[, i])
}

mat3[1:5, 1:5]
##              162333 162334 162335 162336 162337
## AT_Mau_14_01     CC     AA     CT     TC   <NA>
## AT_Mau_14_02     AC     AA     CT     TT     CC
## AT_Mau_15_50     AC     AA     CT     TT     CC
## AT_Mau_15_51     AC   <NA>     CT     TC     CC
## AT_See_14_44     AC     AC     CT     TT     CC
mat3$ind <- rownames(mat3)
dim(mat3)
## [1]  272 1614
mat3[1:5, 1610:1614]
##              163994 163995 163996 163997          ind
## AT_Mau_14_01     GA     GG     GC     AA AT_Mau_14_01
## AT_Mau_14_02     GA     GG     GC     AT AT_Mau_14_02
## AT_Mau_15_50     GA     GG     GC     AA AT_Mau_15_50
## AT_Mau_15_51     GA     GG     GC     AA AT_Mau_15_51
## AT_See_14_44     GA     GA     GC     AA AT_See_14_44
mat4 <- mat3[mat3$ind %in% mat2$sampleId, ]
dim(mat4)
## [1]  152 1614
mat4[1:5, 1600:1614]
##               163983 163984 163985 163986 163987 163988 163989 163991 163992
## AT_Mau_14_01      AA     AG     TT     CG     AG     CC     AC     CC     AG
## AT_Mau_14_02      AA     AG     TT     CG     AG     CC     AC     CC     AA
## AT_Mau_15_50      AA     AG     TC     CG     AG     CT     AC     CG     AG
## AT_Mau_15_51      AA     AG     TT     CG     AG     CC     AC     CG     AA
## AT_gr_12_fall     AA     AG     TT     CG     AG     CT     AC     CG     AA
##               163993 163994 163995 163996 163997           ind
## AT_Mau_14_01      TC     GA     GG     GC     AA  AT_Mau_14_01
## AT_Mau_14_02      TC     GA     GG     GC     AT  AT_Mau_14_02
## AT_Mau_15_50      TC     GA     GG     GC     AA  AT_Mau_15_50
## AT_Mau_15_51      TC     GA     GG     GC     AA  AT_Mau_15_51
## AT_gr_12_fall     TC     GA     GG     GC     AA AT_gr_12_fall
same_ord <- mat2[ , c(1, 2, 3)]
colnames(same_ord) <- c("ind", "city", "country")
mat5 <- merge(mat4, same_ord, by = "ind")
dim(mat5)
## [1]  152 1616
mat5[1:5, 1:5]
##               ind 162333 162334 162335 162336
## 1   AT_gr_12_fall     AC   <NA>     CT     TT
## 2 AT_gr_12_spring     AC     AA     CT     TT
## 3    AT_Mau_14_01     CC     AA     CT     TC
## 4    AT_Mau_14_02     AC     AA     CT     TT
## 5    AT_Mau_15_50     AC     AA     CT     TT
rownames(mat5) <- mat5[ , 1]
dim(mat5)
## [1]  152 1616
mat5[1:5, 1610:1616]
##                 163993 163994 163995 163996 163997             city country
## AT_gr_12_fall       TC     GA     GG     GC     AA Gross-Enzersdorf Austria
## AT_gr_12_spring     TC     GA     GG     GC     AA Gross-Enzersdorf Austria
## AT_Mau_14_01        TC     GA     GG     GC     AA      Mauternbach Austria
## AT_Mau_14_02        TC     GA     GG     GC     AT      Mauternbach Austria
## AT_Mau_15_50        TC     GA     GG     GC     AA      Mauternbach Austria
mat6 <- mat5[ , -c(1, 1615, 1616)]
dim(mat6)
## [1]  152 1613
mat6[1:5, 1:5]
##                 162333 162334 162335 162336 162337
## AT_gr_12_fall       AC   <NA>     CT     TT     CT
## AT_gr_12_spring     AC     AA     CT     TT     CC
## AT_Mau_14_01        CC     AA     CT     TC   <NA>
## AT_Mau_14_02        AC     AA     CT     TT     CC
## AT_Mau_15_50        AC     AA     CT     TT     CC
mat6[1:5, 1609:1613]
##                 163993 163994 163995 163996 163997
## AT_gr_12_fall       TC     GA     GG     GC     AA
## AT_gr_12_spring     TC     GA     GG     GC     AA
## AT_Mau_14_01        TC     GA     GG     GC     AA
## AT_Mau_14_02        TC     GA     GG     GC     AT
## AT_Mau_15_50        TC     GA     GG     GC     AA
fly_gen <- df2genind(mat6, ploidy = 2, ind.names = rownames(mat6), pop = mat5$country, sep = "")
fly_gen
## /// GENIND OBJECT /////////
## 
##  // 152 individuals; 1,613 loci; 3,225 alleles; size: 2.7 Mb
## 
##  // Basic content
##    @tab:  152 x 3225 matrix of allele counts
##    @loc.n.all: number of alleles per locus (range: 1-2)
##    @loc.fac: locus factor for the 3225 columns of @tab
##    @all.names: list of allele names for each locus
##    @ploidy: ploidy of each individual  (range: 2-2)
##    @type:  codom
##    @call: df2genind(X = mat6, sep = "", ind.names = rownames(mat6), pop = mat5$country, 
##     ploidy = 2)
## 
##  // Optional content
##    @pop: population of each individual (group size range: 5-38)
fly_gen$tab[1:5, 1:5]
##                 162333.A 162333.C 162334.A 162334.C 162335.C
## AT_gr_12_fall          1        1       NA       NA        1
## AT_gr_12_spring        1        1        2        0        1
## AT_Mau_14_01           0        2        2        0        1
## AT_Mau_14_02           1        1        2        0        1
## AT_Mau_15_50           1        1        2        0        1
summary(fly_gen$pop)
##         Austria           MI-WI Charlottesville         Denmark         Esparto 
##               6               9               6               5               5 
##         Finland          France         Germany           NY-MA              PA 
##               6              10              12               6              18 
##          Russia           Spain        Tuolumne          Turkey         Ukraine 
##               6               7               5              13              38
indmiss_fly <- propTyped(fly_gen, by = "ind")
indmiss_fly[which(indmiss_fly < 0.80)]
## PA_li_10_spring 
##       0.7123373
barplot(indmiss_fly, ylim = c(0, 1), ylab = "Complete genotypes (proportion)", xlab = "", las = 2, cex.names = 0.4)
mtext("Genotypes", side = 1, line = 3.7)

fly_gen <- missingno(fly_gen, type = "geno", cutoff = 0.20) ## This is the genind file that I will use for all the analyses
fly_gen
## /// GENIND OBJECT /////////
## 
##  // 151 individuals; 1,613 loci; 3,225 alleles; size: 2.7 Mb
## 
##  // Basic content
##    @tab:  151 x 3225 matrix of allele counts
##    @loc.n.all: number of alleles per locus (range: 1-2)
##    @loc.fac: locus factor for the 3225 columns of @tab
##    @all.names: list of allele names for each locus
##    @ploidy: ploidy of each individual  (range: 2-2)
##    @type:  codom
##    @call: .local(x = x, i = i, j = j, drop = drop)
## 
##  // Optional content
##    @pop: population of each individual (group size range: 5-38)
## quality control begins here

mlg(fly_gen) ## keep only polymorphic loci
## #############################
## # Number of Individuals:  151 
## # Number of MLG:  151 
## #############################
## [1] 151
isPoly(fly_gen) %>% summary
##    Mode   FALSE    TRUE 
## logical       1    1612
poly_loci <- names(which(isPoly(fly_gen) == TRUE))
fly_gen2 <- fly_gen[loc = poly_loci]
isPoly(fly_gen2) %>% summary
##    Mode    TRUE 
## logical    1612
fly_gen2$loc.n.all <- fly_gen2$loc.n.all[which(fly_gen2$loc.n.all == 2)]

n <- names(which(nAll(fly_gen2) == 2))
fly_gen2 <- fly_gen2[loc = n]
fly_gen2
## /// GENIND OBJECT /////////
## 
##  // 151 individuals; 1,612 loci; 3,224 alleles; size: 2.7 Mb
## 
##  // Basic content
##    @tab:  151 x 3224 matrix of allele counts
##    @loc.n.all: number of alleles per locus (range: 2-2)
##    @loc.fac: locus factor for the 3224 columns of @tab
##    @all.names: list of allele names for each locus
##    @ploidy: ploidy of each individual  (range: 2-2)
##    @type:  codom
##    @call: .local(x = x, i = i, j = j, loc = ..1, drop = drop)
## 
##  // Optional content
##    @pop: population of each individual (group size range: 5-38)
indmiss_fly2 <- propTyped(fly_gen2, by = "ind")

barplot(indmiss_fly2, ylim = c(0, 1), ylab = "Complete genotypes (proportion)", xlab = "", las = 2, cex.names = 0.4)
mtext("Genotypes", side = 1, line = 3.7)

locmiss_fly <- propTyped(fly_gen2, by = "loc")
locmiss_fly[which(locmiss_fly < 0.80)] ## print loci with < 80% complete genotypes
##    162334    162396    162412    162545    162546    162547    162549    162551 
## 0.6225166 0.6092715 0.7880795 0.7947020 0.7682119 0.7947020 0.7748344 0.7880795 
##    162553    162554    162555    162557    162560    162563    162587    162623 
## 0.7880795 0.7814570 0.7880795 0.7947020 0.7814570 0.7947020 0.7748344 0.5298013 
##    162746    162755    162756    162882    162892    162986    162987    162988 
## 0.4238411 0.5562914 0.5562914 0.7748344 0.4238411 0.7152318 0.7152318 0.7284768 
##    162989    162990    162992    163029    163072    163073    163074    163075 
## 0.7284768 0.7284768 0.7549669 0.6291391 0.4834437 0.4370861 0.4834437 0.4834437 
##    163076    163118    163119    163120    163121    163122    163123    163124 
## 0.4834437 0.6688742 0.6688742 0.6688742 0.6688742 0.6688742 0.6158940 0.6158940 
##    163125    163307    163363    163365    163366    163608    163648    163650 
## 0.4370861 0.4768212 0.7947020 0.7947020 0.7483444 0.7350993 0.5165563 0.5827815 
##    163692    163715    163716    163717    163765    163821    163908    163910 
## 0.4569536 0.7086093 0.7086093 0.7086093 0.6357616 0.6092715 0.5231788 0.5629139 
##    163914    163915    163916    163917    163922    163958 
## 0.6158940 0.5894040 0.5827815 0.5761589 0.6953642 0.7019868
barplot(locmiss_fly, ylim = c(0, 1), ylab = "Complete genotypes (proportion)", xlab = "", las = 2, cex.names = 0.4)
mtext("Locus", side = 1, line = 3)

fly_gen3 <- missingno(fly_gen2, type = "loci", cutoff = 0.20)
fly_gen3
## /// GENIND OBJECT /////////
## 
##  // 151 individuals; 1,550 loci; 3,100 alleles; size: 2.6 Mb
## 
##  // Basic content
##    @tab:  151 x 3100 matrix of allele counts
##    @loc.n.all: number of alleles per locus (range: 2-2)
##    @loc.fac: locus factor for the 3100 columns of @tab
##    @all.names: list of allele names for each locus
##    @ploidy: ploidy of each individual  (range: 2-2)
##    @type:  codom
##    @call: .local(x = x, i = i, j = j, drop = drop)
## 
##  // Optional content
##    @pop: population of each individual (group size range: 5-38)
locmiss_fly3 <- propTyped(fly_gen3, by = "loc")
locmiss_fly3[which(locmiss_fly3 < 0.80)]
## named numeric(0)
fly_gen3$tab[1:5, 1:5]
##                 162333.A 162333.C 162335.C 162335.T 162336.T
## AT_gr_12_fall          1        1        1        1        2
## AT_gr_12_spring        1        1        1        1        2
## AT_Mau_14_01           0        2        1        1        1
## AT_Mau_14_02           1        1        1        1        2
## AT_Mau_15_50           1        1        1        1        2
min(ploidy(fly_gen3))
## [1] 2
max(ploidy(fly_gen3))
## [1] 2
summary(fly_gen3$pop)
##         Austria           MI-WI Charlottesville         Denmark         Esparto 
##               6               9               6               5               5 
##         Finland          France         Germany           NY-MA              PA 
##               6              10              12               6              17 
##          Russia           Spain        Tuolumne          Turkey         Ukraine 
##               6               7               5              13              38
mlg(fly_gen3)
## #############################
## # Number of Individuals:  151 
## # Number of MLG:  151 
## #############################
## [1] 151
isPoly(fly_gen3) %>% summary
##    Mode    TRUE 
## logical    1550
poly_loci3 <- names(which(isPoly(fly_gen3) == TRUE))
fly_gen3 <- fly_gen3[loc = poly_loci3]
isPoly(fly_gen3) %>% summary
##    Mode    TRUE 
## logical    1550
barplot(locmiss_fly3, ylim = c(0, 1), ylab = "Complete genotypes (proportion)", xlab = "", las = 2, cex.names = 0.4)
mtext("Locus", side = 1, line = 3)

## computing basic stats

basic_fly <- basic.stats(fly_gen3, diploid = TRUE)

## testing for H-W equilibrium

## Chi-squared test: p-value

HWE.test <- data.frame(sapply(seppop(fly_gen3), 
                              function(ls) pegas::hw.test(ls, B = 0)[ , 3]))


HWE.test.chisq <- t(data.matrix(HWE.test))
HWE.test.chisq[1:5, 1:5]
##                     162333     162335    162336    162337    162338
## Austria         0.08018122 0.01430588 0.6242061 0.8037847 1.0000000
## MI.WI           0.08968602 0.04722090 1.0000000 1.0000000 0.5485062
## Charlottesville 0.23013934 0.02534732 1.0000000 0.8237839 0.8037847
## Denmark         0.33790402 0.02534732 1.0000000 0.5761501 1.0000000
## Esparto         0.23013934 0.04550026 1.0000000 0.7750970 0.8037847
## Monte Carlo: p-value

HWE.test <- data.frame(sapply(seppop(fly_gen3), 
                              function(ls) pegas::hw.test(ls, B = 1000)[ , 4]))

HWE.test.MC <- t(data.matrix(HWE.test))
HWE.test.MC[1:5, 1:5]
##                 162333 162335 162336 162337 162338
## Austria          0.404  0.087      1      1      1
## MI.WI            0.437  0.174      1      1      1
## Charlottesville  1.000  0.147      1      1      1
## Denmark          1.000  0.111      1      1      1
## Esparto          1.000  0.320      1      1      1
alpha <- 0.05

Prop.loci.out.of.HWE <- data.frame(Chisq = apply(HWE.test.chisq < alpha, 2, mean), MC = apply(HWE.test.MC < alpha, 2, mean))

Prop.loci.out.of.HWE[1:10, ]
##            Chisq        MC
## 162333 0.3333333 0.2666667
## 162335 1.0000000 0.4000000
## 162336 0.0000000 0.0000000
## 162337 0.0000000 0.0000000
## 162338 0.0000000 0.0000000
## 162339 0.0000000 0.0000000
## 162340 0.0000000 0.0000000
## 162341 1.0000000 0.4666667
## 162342 0.0000000 0.0000000
## 162343 0.0000000 0.0000000
## "False discovery rate" correction for the number of tests. Here I use the function ‘p.adjust’ with the argument method = "fdr" to adjust the p-values from the previous tests

Chisq.fdr <- matrix(p.adjust(HWE.test.chisq, method = "fdr"), 
                    nrow = nrow(HWE.test.chisq))

MC.fdr <- matrix(p.adjust(HWE.test.MC, method = "fdr"), 
                    nrow = nrow(HWE.test.MC))

Prop.loci.out.of.HWE <- data.frame(Chisq = apply(HWE.test.chisq<alpha, 2, mean), 
           MC = apply(HWE.test.MC<alpha, 2, mean),
           Chisq.fdr = apply(Chisq.fdr<alpha, 2, mean),
           MC.fdr = apply(MC.fdr<alpha, 2, mean))

head(Prop.loci.out.of.HWE)
##            Chisq        MC Chisq.fdr    MC.fdr
## 162333 0.3333333 0.2666667 0.2666667 0.1333333
## 162335 1.0000000 0.4000000 0.6000000 0.2000000
## 162336 0.0000000 0.0000000 0.0000000 0.0000000
## 162337 0.0000000 0.0000000 0.0000000 0.0000000
## 162338 0.0000000 0.0000000 0.0000000 0.0000000
## 162339 0.0000000 0.0000000 0.0000000 0.0000000
loci <- data.frame()

idx <- 1

for(i in 1:nrow(Prop.loci.out.of.HWE)){
    fdr <- Prop.loci.out.of.HWE[i , ]
    if(fdr$MC.fdr > 0.5){
        loci <- rbind(fdr, loci)
    }

}

loci ## none of the loci are consistenly out of HWE. Some of them are out of HWE in less than 50% of the populations.
## data frame with 0 columns and 0 rows
dim(loci)
## [1] 0 0
## check for linkage disequilibrium (LD)

LD <- as.genclone(fly_gen3)
LD
## 
## This is a genclone object
## -------------------------
## Genotype information:
## 
##     151 original multilocus genotypes 
##     151 diploid individuals
##    1550 codominant loci
## 
## Population information:
## 
##       0 strata. 
##      15 populations defined - 
## Austria, MI-WI, Charlottesville, ..., Tuolumne, Turkey, Ukraine
quartz()
LD.general <- poppr::ia(LD, sample = 500)

LD.general
##           Ia         p.Ia        rbarD         p.rD 
## 15.592940300  0.001996008  0.013771123  0.001996008



Computing pairwise Fst test

fly_fst <- genet.dist(fly_gen3, method = "WC84") %>% round(digits = 3)
fly_fst[[1:10, 1:10]]
##                 Austria MI-WI Charlottesville Denmark Esparto Finland France
## MI-WI             0.053                                                     
## Charlottesville   0.058 0.005                                               
## Denmark           0.018 0.049           0.054                               
## Esparto           0.056 0.012           0.017   0.049                       
## Finland           0.015 0.049           0.057   0.024   0.054               
## France            0.021 0.042           0.045   0.018   0.041   0.022       
## Germany           0.012 0.041           0.046   0.016   0.043   0.013  0.010
## NY-MA             0.044 0.006           0.003   0.040   0.009   0.044  0.033
## PA                0.053 0.004           0.004   0.050   0.013   0.051  0.044
##                 Germany NY-MA
## MI-WI                        
## Charlottesville              
## Denmark                      
## Esparto                      
## Finland                      
## France                       
## Germany                      
## NY-MA             0.034      
## PA                0.041 0.002
fst.mat <- as.matrix(fly_fst)
fst.mat[fst.mat < 0] <- 0
fst.mat[1:10, 1:10]
##                 Austria MI-WI Charlottesville Denmark Esparto Finland France
## Austria           0.000 0.053           0.058   0.018   0.056   0.015  0.021
## MI-WI             0.053 0.000           0.005   0.049   0.012   0.049  0.042
## Charlottesville   0.058 0.005           0.000   0.054   0.017   0.057  0.045
## Denmark           0.018 0.049           0.054   0.000   0.049   0.024  0.018
## Esparto           0.056 0.012           0.017   0.049   0.000   0.054  0.041
## Finland           0.015 0.049           0.057   0.024   0.054   0.000  0.022
## France            0.021 0.042           0.045   0.018   0.041   0.022  0.000
## Germany           0.012 0.041           0.046   0.016   0.043   0.013  0.010
## NY-MA             0.044 0.006           0.003   0.040   0.009   0.044  0.033
## PA                0.053 0.004           0.004   0.050   0.013   0.051  0.044
##                 Germany NY-MA    PA
## Austria           0.012 0.044 0.053
## MI-WI             0.041 0.006 0.004
## Charlottesville   0.046 0.003 0.004
## Denmark           0.016 0.040 0.050
## Esparto           0.043 0.009 0.013
## Finland           0.013 0.044 0.051
## France            0.010 0.033 0.044
## Germany           0.000 0.034 0.041
## NY-MA             0.034 0.000 0.002
## PA                0.041 0.002 0.000
tab <- gi2gl(fly_gen3, parallel = FALSE, verbose = NULL) ## converting genind object to genlight object
## Starting gi2gl 
## Starting gl.compliance.check 
##   Processing genlight object with SNP data
##   Checking coding of SNPs
##     SNP data scored NA, 0, 1 or 2 confirmed
##   Checking locus metrics and flags
##   Recalculating locus metrics
##   Checking for monomorphic loci
##     No monomorphic loci detected
##   Checking for loci with all missing data
##     No loci with all missing data detected
##   Checking whether individual names are unique.
##   Checking for individual metrics
##   Warning: Creating a slot for individual metrics
##   Checking for population assignments
##     Population assignments confirmed
##   Spelling of coordinates checked and changed if necessary to 
##             lat/lon
## Completed: gl.compliance.check 
## Completed: gi2gl
pval_fst <- gl.fst.pop(tab, nboots = 1000, percent = 95, nclusters = 1, verbose = NULL)
## Starting gl.fst.pop 
##   Processing genlight object with SNP data
## Completed: gl.fst.pop
fst <- as.matrix(as.dist(pval_fst$Fsts))
fst[fst < 0] <- 0
fst
##                    Austria     Esparto    Tuolumne     Germany    Denmark
## Austria         0.00000000 0.055703474 0.058895752 0.012410932 0.01800624
## Esparto         0.05570347 0.000000000 0.000000000 0.043366976 0.04899978
## Tuolumne        0.05889575 0.000000000 0.000000000 0.046380979 0.05326210
## Germany         0.01241093 0.043366976 0.046380979 0.000000000 0.01632085
## Denmark         0.01800624 0.048999783 0.053262097 0.016320855 0.00000000
## Spain           0.01644031 0.044356991 0.047779471 0.007609145 0.02535346
## Finland         0.01510726 0.054481851 0.060073773 0.013480562 0.02446933
## France          0.02086204 0.040896003 0.045503573 0.009973570 0.01796563
## NY-MA           0.04387058 0.008737692 0.007955468 0.033551239 0.04003517
## MI-WI           0.05336238 0.012416407 0.008962355 0.041320541 0.04880945
## PA              0.05310220 0.013471651 0.009573028 0.041313299 0.05015960
## Russia          0.01689966 0.053516381 0.060654501 0.020656074 0.02332836
## Turkey          0.01412191 0.060541318 0.062921149 0.018900400 0.02179383
## Ukraine         0.01416309 0.063774691 0.069481577 0.020327235 0.02369749
## Charlottesville 0.05819820 0.017272103 0.015222898 0.046035247 0.05402774
##                       Spain    Finland     France       NY-MA       MI-WI
## Austria         0.016440309 0.01510726 0.02086204 0.043870575 0.053362379
## Esparto         0.044356991 0.05448185 0.04089600 0.008737692 0.012416407
## Tuolumne        0.047779471 0.06007377 0.04550357 0.007955468 0.008962355
## Germany         0.007609145 0.01348056 0.00997357 0.033551239 0.041320541
## Denmark         0.025353461 0.02446933 0.01796563 0.040035167 0.048809450
## Spain           0.000000000 0.02001428 0.01567832 0.029541647 0.039313950
## Finland         0.020014275 0.00000000 0.02212788 0.043508846 0.048678641
## France          0.015678321 0.02212788 0.00000000 0.033457193 0.042244680
## NY-MA           0.029541647 0.04350885 0.03345719 0.000000000 0.005774646
## MI-WI           0.039313950 0.04867864 0.04224468 0.005774646 0.000000000
## PA              0.038958914 0.05145801 0.04367546 0.001936861 0.004351338
## Russia          0.031639501 0.01913517 0.02199934 0.047419819 0.052957069
## Turkey          0.026809159 0.01677821 0.02823027 0.049358359 0.054304445
## Ukraine         0.031831900 0.02245250 0.02881299 0.055798661 0.064183546
## Charlottesville 0.046471038 0.05728551 0.04450952 0.002951612 0.004506388
##                          PA      Russia     Turkey     Ukraine Charlottesville
## Austria         0.053102204 0.016899657 0.01412191 0.014163086     0.058198201
## Esparto         0.013471651 0.053516381 0.06054132 0.063774691     0.017272103
## Tuolumne        0.009573028 0.060654501 0.06292115 0.069481577     0.015222898
## Germany         0.041313299 0.020656074 0.01890040 0.020327235     0.046035247
## Denmark         0.050159598 0.023328359 0.02179383 0.023697489     0.054027742
## Spain           0.038958914 0.031639501 0.02680916 0.031831900     0.046471038
## Finland         0.051458012 0.019135166 0.01677821 0.022452499     0.057285510
## France          0.043675456 0.021999335 0.02823027 0.028812985     0.044509519
## NY-MA           0.001936861 0.047419819 0.04935836 0.055798661     0.002951612
## MI-WI           0.004351338 0.052957069 0.05430445 0.064183546     0.004506388
## PA              0.000000000 0.058255707 0.05610334 0.066041832     0.004019586
## Russia          0.058255707 0.000000000 0.01717863 0.007515034     0.059645911
## Turkey          0.056103340 0.017178630 0.00000000 0.012939846     0.058036081
## Ukraine         0.066041832 0.007515034 0.01293985 0.000000000     0.067300473
## Charlottesville 0.004019586 0.059645911 0.05803608 0.067300473     0.000000000



my_palette <- colorRampPalette(c("green", "yellow", "red"))(n = 100)

heatmap.2(fst.mat, density.info = "none", trace = "none", scale = "none", cexRow = 0.7, cexCol = 0.7, key.title = "Fst", srtCol = 45, srtRow = 35, margins = c(8.5, 5), col = my_palette)

Figure 1. Heatmap depicting genetic differentiation among populations of D. melanogaster based on pairwise Fst values. In some cases, samples from adjacent localities were combined to avoid low sample sizes. Abbreviations are as follows: PA = Pennsylvania; MI-WI = Michigan and Wisconsin; NY-MA = New York and Massachusetts.



OutFlANK outlier analysis

fly_gen3
## /// GENIND OBJECT /////////
## 
##  // 151 individuals; 1,550 loci; 3,100 alleles; size: 2.6 Mb
## 
##  // Basic content
##    @tab:  151 x 3100 matrix of allele counts
##    @loc.n.all: number of alleles per locus (range: 2-2)
##    @loc.fac: locus factor for the 3100 columns of @tab
##    @all.names: list of allele names for each locus
##    @ploidy: ploidy of each individual  (range: 2-2)
##    @type:  codom
##    @call: .local(x = x, i = i, j = j, loc = ..1, drop = drop)
## 
##  // Optional content
##    @pop: population of each individual (group size range: 5-38)
snps <- as.data.frame(nAll(fly_gen3))
indi <- as.data.frame(fly_gen3$tab)

rownames(snps)[1:10]
##  [1] "162333" "162335" "162336" "162337" "162338" "162339" "162340" "162341"
##  [9] "162342" "162343"
rownames(indi)[1:10]
##  [1] "AT_gr_12_fall"   "AT_gr_12_spring" "AT_Mau_14_01"    "AT_Mau_14_02"   
##  [5] "AT_Mau_15_50"    "AT_Mau_15_51"    "CA_es_12_fall"   "CA_es_12_frost" 
##  [9] "CA_es_12_spring" "CA_es_13_fall"
seqSetFilter(genofile, variant.id = rownames(snps), sample.id = rownames(indi))
## # of selected samples: 151
## # of selected variants: 1,550
##seqGDS2VCF(genofile, "my_vcf.gz", verbose = TRUE)

outest <- read.vcfR("my_vcf.gz")
## Scanning file to determine attributes.
## File attributes:
##   meta lines: 44
##   header_line: 45
##   variant count: 1550
##   column count: 160
## 
Meta line 44 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 1550
##   Character matrix gt cols: 160
##   skip: 0
##   nrows: 1550
##   row_num: 0
## 
Processed variant 1000
Processed variant: 1550
## All variants processed
dataout <- vcfR2genind(outest)
dataout$pop <- as.factor(fly_gen3$pop)

outflnk <- gl.outflank(dataout, qthreshold = 0.05, plot = FALSE)
## Calculating FSTs, may take a few minutes...
outflnk.df <- outflnk$outflank$results

rowsToRemove <- seq(1, nrow(outflnk.df), by = 2)
outflnk.df <- outflnk.df[-rowsToRemove, ]

outflnk.df$OutlierFlag %>% summary
##    Mode   FALSE    NA's 
## logical    1237     313
outflnk.df <- na.omit(outflnk.df)

outflnk.df$FST[outflnk.df$FST < 0] <- 0
head(outflnk.df)
##       LocusName        He         FST            T1         T2   FSTNoCorr
## 2  2L_3622148.0 0.4791145 0.009766519  2.345072e-03 0.24011342 0.028452713
## 4  2L_3622167.1 0.4998965 0.000000000 -2.026437e-05 0.24997177 0.001549929
## 6  2L_3622402.1 0.1118148 0.032767588  1.844529e-03 0.05629127 0.081124841
## 8  2L_3622520.1 0.1699219 0.086990263  7.484706e-03 0.08604073 0.127168698
## 10 2L_3622544.1 0.1638000 0.028275732  2.329770e-03 0.08239470 0.072847321
## 12 2L_3622548.1 0.2225125 0.076400369  8.594932e-03 0.11249857 0.113994917
##        T1NoCorr   T2NoCorr meanAlleleFreq indexOrder GoodH   qvalues    pvalues
## 2  0.0068386640 0.24035192      0.6021898          2 goodH 0.6691213 0.68774049
## 4  0.0003874701 0.24999218      0.5071942          4 goodH 0.7222558 0.04538728
## 6  0.0045779572 0.05643102      0.9405594          6 goodH 0.4579630 0.60148959
## 8  0.0109645441 0.08622046      0.9062500          8 goodH 0.3903686 0.30413760
## 10 0.0060152736 0.08257371      0.9100000         10 goodH 0.4733307 0.67993881
## 12 0.0128484360 0.11271060      0.8724832         12 goodH 0.4080323 0.36966216
##    pvaluesRightTail OutlierFlag
## 2         0.6561298       FALSE
## 4         0.9773064       FALSE
## 6         0.3007448       FALSE
## 8         0.1520688       FALSE
## 10        0.3399694       FALSE
## 12        0.1848311       FALSE
## checking assumptions

plot(outflnk.df$FST, outflnk.df$FSTNoCorr, pch = 19, col = alpha("black", 0.3), ylab = "Fst uncorrected", xlab = "Fst corrected", las = 1, main = "Loci deviation from the linear relationship")
abline(0, 1, col = "skyblue", lwd = 2)



## Italic labels

fstlab <- expression(italic("F")[ST])
hetlab <- expression(italic("H")[e])


## Plot He versus Fst

outFlank <- ggplot(data = outflnk.df) +
  geom_point(aes(x = He, y = FST, colour = OutlierFlag)) +
  scale_colour_manual(values = c("black","red"), labels = c("Neutral SNP","Outlier SNP")) +
  xlab(hetlab) +
  ylab(fstlab) +
  theme(legend.title = element_blank(),
  )


outFlank

Figure 3. Manhattan plot of the distribution of Fst loci as a function of their heterozygosity. No Fst loci showed significant deviation from the genetic background.



Genetic admixture proportions

## exploring the data once again (do it all the time!)

fly_gen3
## /// GENIND OBJECT /////////
## 
##  // 151 individuals; 1,550 loci; 3,100 alleles; size: 2.6 Mb
## 
##  // Basic content
##    @tab:  151 x 3100 matrix of allele counts
##    @loc.n.all: number of alleles per locus (range: 2-2)
##    @loc.fac: locus factor for the 3100 columns of @tab
##    @all.names: list of allele names for each locus
##    @ploidy: ploidy of each individual  (range: 2-2)
##    @type:  codom
##    @call: .local(x = x, i = i, j = j, loc = ..1, drop = drop)
## 
##  // Optional content
##    @pop: population of each individual (group size range: 5-38)
fly_gen3$tab[1:5, 1:5]
##                 162333.A 162333.C 162335.C 162335.T 162336.T
## AT_gr_12_fall          1        1        1        1        2
## AT_gr_12_spring        1        1        1        1        2
## AT_Mau_14_01           0        2        1        1        1
## AT_Mau_14_02           1        1        1        1        2
## AT_Mau_15_50           1        1        1        1        2
nLoc(fly_gen3) # number of loci
## [1] 1550
nPop(fly_gen3) # number of sites
## [1] 15
nInd(fly_gen3) # number of individuals
## [1] 151
summary(fly_gen3$pop) # sample size
##         Austria           MI-WI Charlottesville         Denmark         Esparto 
##               6               9               6               5               5 
##         Finland          France         Germany           NY-MA              PA 
##               6              10              12               6              17 
##          Russia           Spain        Tuolumne          Turkey         Ukraine 
##               6               7               5              13              38
min(ploidy(fly_gen3))
## [1] 2
max(ploidy(fly_gen3))
## [1] 2
snmf1 <- load.snmfProject("gl_geno.snmfProject")


## extract Q-matrix for the best run

plot(snmf1, col = "blue", cex = 1.5, pch = 19, las = 1)

## extract the cross-entropy of all runs where K = 9

ce <- cross.entropy(snmf1, K = 9)
ce
##            K = 9
## run 1  0.2868380
## run 2  0.2973042
## run 3  0.2951137
## run 4  0.2993486
## run 5  0.2950550
## run 6  0.2978129
## run 7  0.2963995
## run 8  0.3005061
## run 9  0.2952275
## run 10 0.2980091
## run 11 0.2930379
## run 12 0.2862978
## run 13 0.2982294
## run 14 0.2954692
## run 15 0.3006121
## run 16 0.3052267
## run 17 0.2922759
## run 18 0.2974110
## run 19 0.3013653
## run 20 0.3033054
## run 21 0.2973955
## run 22 0.3023131
## run 23 0.3005838
## run 24 0.2985870
## run 25 0.3010128
## run 26 0.2873666
## run 27 0.2937488
## run 28 0.2929741
## run 29 0.3025899
## run 30 0.2974862
## run 31 0.2986869
## run 32 0.3019270
## run 33 0.2917947
## run 34 0.2929909
## run 35 0.2986107
## run 36 0.2974049
## run 37 0.3030945
## run 38 0.2906817
## run 39 0.3023015
## run 40 0.3027009
## run 41 0.3007979
## run 42 0.3054084
## run 43 0.3001154
## run 44 0.2986051
## run 45 0.3028176
## run 46 0.2986939
## run 47 0.3030751
## run 48 0.3048518
## run 49 0.3040839
## run 50 0.3093110
## find the run with the lowest cross-entropy

lowest.ce <- which.min(ce)
lowest.ce
## [1] 12
## extract Q-matrix for the best run

qmatrix <- as.data.frame(Q(snmf1, K = 9, run = lowest.ce))
head(qmatrix)
##            V1          V2          V3          V4        V5          V6
## 1 0.048722400 0.309613000 9.99639e-05 9.99639e-05 0.5840360 9.99639e-05
## 2 0.261074000 0.049324700 8.74701e-03 9.99730e-05 0.2930510 1.24752e-01
## 3 0.126625000 0.290528000 9.99730e-05 9.99730e-05 0.3630220 6.01603e-03
## 4 0.083122600 0.000099973 1.66848e-02 9.99730e-05 0.0992241 9.99730e-05
## 5 0.000099955 0.529978000 9.99550e-05 9.55478e-03 0.2290610 9.99550e-05
## 6 0.000099964 0.096673000 9.99640e-05 9.99640e-05 0.3324140 9.99640e-05
##            V7          V8          V9
## 1 9.99639e-05 0.040000000 0.017229500
## 2 9.99730e-05 0.000099973 0.262751000
## 3 2.46401e-02 0.188868000 0.000099973
## 4 5.04343e-02 0.648635000 0.101599000
## 5 9.99550e-05 0.230907000 0.000099955
## 6 8.43814e-02 0.449977000 0.036154800
## changing order of levels

pops <- fly_gen3$pop
levels(pops)
##  [1] "Austria"         "MI-WI"           "Charlottesville" "Denmark"        
##  [5] "Esparto"         "Finland"         "France"          "Germany"        
##  [9] "NY-MA"           "PA"              "Russia"          "Spain"          
## [13] "Tuolumne"        "Turkey"          "Ukraine"
levels(pops)[3] <- "Charlott."


qmplot <- cbind(qmatrix, pops)
qmplot$pops <- factor(qmplot$pops, levels = c("Esparto", "Tuolumne", "MI-WI", "Charlott.", "PA", "NY-MA", "Spain", "France", "Germany", "Austria", "Denmark", "Finland", "Ukraine", "Russia", "Turkey"))
qmplot <- qmplot[order(qmplot$pops), ]

pops <- qmplot$pops
layout(matrix(c(0, 1, 1, 0,
                0, 1, 1, 0,
                2, 2, 2, 2,
                2, 2, 2, 2), nrow = 4, ncol = 4, byrow = TRUE))

par(mar = c(4, 4, 1, 0))
plot(snmf1, col = "blue", cex = 1.5, pch = 19, las = 1, cex.lab = 1.3)
Arrows(x = 9, y = 0.295, x1 = 9, y1 = 0.305, col = "red", arr.type = "triangle", code = 1, lwd = 1.5, arr.length = 0.2)
mtext("(a)", side = 2, at = 0.345, line = 2.8, font = 2, family = "serif", las = 1)

par(mar = c(5, 4.5, 1, 0))
barplot(t(qmplot[1:9]), col = RColorBrewer::brewer.pal(9,"Paired"), border = NA, space = 0, xlab = "", xaxt = "n",  ylab = "Admixture proportion", las = 1, cex.lab = 1.4)

## adding population labels to the axis:

names <- unique(as.character(pops))

medians <- c()

for(i in 1:length(pops)){
    
    axis(1, at = median(which(pops == pops[i])), labels = "")
    medians <- c(medians, median(which(pops == pops[i])))
}

medians[1:10]
##  [1] 3 3 3 3 3 8 8 8 8 8
text(x = as.numeric(unique(as.character(medians))), y = par("usr")[3] - 0.05, labels = names, xpd = NA, srt = 35, cex = 1, adj = 1.2)
mtext("Individuals", side = 1, line = 4)
mtext("(b)", side = 2, at = 1.1, line = 2.4, font = 2, family = "serif", las = 1)

Figure 4. Population structure analysis based on individual ancestry coefficients for a number of ancestral populations. (a) Cross-entropy values for each snmf run with k ranging between k=1 and k=10. The red arrow indicates the most likely value of k. (b) Admixture proportion across populations of D. melanogaster. Colors indicate genetic clusters.



## label column names of qmatrix

ncol(qmatrix)
## [1] 9
cluster_names = c()

for(i in 1:ncol(qmatrix)){
  cluster_names[i] = paste("Cluster", i)
}

cluster_names
## [1] "Cluster 1" "Cluster 2" "Cluster 3" "Cluster 4" "Cluster 5" "Cluster 6"
## [7] "Cluster 7" "Cluster 8" "Cluster 9"
colnames(qmatrix) <- cluster_names
head(qmatrix)
##     Cluster 1   Cluster 2   Cluster 3   Cluster 4 Cluster 5   Cluster 6
## 1 0.048722400 0.309613000 9.99639e-05 9.99639e-05 0.5840360 9.99639e-05
## 2 0.261074000 0.049324700 8.74701e-03 9.99730e-05 0.2930510 1.24752e-01
## 3 0.126625000 0.290528000 9.99730e-05 9.99730e-05 0.3630220 6.01603e-03
## 4 0.083122600 0.000099973 1.66848e-02 9.99730e-05 0.0992241 9.99730e-05
## 5 0.000099955 0.529978000 9.99550e-05 9.55478e-03 0.2290610 9.99550e-05
## 6 0.000099964 0.096673000 9.99640e-05 9.99640e-05 0.3324140 9.99640e-05
##     Cluster 7   Cluster 8   Cluster 9
## 1 9.99639e-05 0.040000000 0.017229500
## 2 9.99730e-05 0.000099973 0.262751000
## 3 2.46401e-02 0.188868000 0.000099973
## 4 5.04343e-02 0.648635000 0.101599000
## 5 9.99550e-05 0.230907000 0.000099955
## 6 8.43814e-02 0.449977000 0.036154800
dim(qmatrix)
## [1] 151   9
## add individual IDs

qmatrix$Ind <- indNames(fly_gen3)

## add site IDs

qmatrix$Site <- fly_gen3$pop
head(qmatrix)
##     Cluster 1   Cluster 2   Cluster 3   Cluster 4 Cluster 5   Cluster 6
## 1 0.048722400 0.309613000 9.99639e-05 9.99639e-05 0.5840360 9.99639e-05
## 2 0.261074000 0.049324700 8.74701e-03 9.99730e-05 0.2930510 1.24752e-01
## 3 0.126625000 0.290528000 9.99730e-05 9.99730e-05 0.3630220 6.01603e-03
## 4 0.083122600 0.000099973 1.66848e-02 9.99730e-05 0.0992241 9.99730e-05
## 5 0.000099955 0.529978000 9.99550e-05 9.55478e-03 0.2290610 9.99550e-05
## 6 0.000099964 0.096673000 9.99640e-05 9.99640e-05 0.3324140 9.99640e-05
##     Cluster 7   Cluster 8   Cluster 9             Ind    Site
## 1 9.99639e-05 0.040000000 0.017229500   AT_gr_12_fall Austria
## 2 9.99730e-05 0.000099973 0.262751000 AT_gr_12_spring Austria
## 3 2.46401e-02 0.188868000 0.000099973    AT_Mau_14_01 Austria
## 4 5.04343e-02 0.648635000 0.101599000    AT_Mau_14_02 Austria
## 5 9.99550e-05 0.230907000 0.000099955    AT_Mau_15_50 Austria
## 6 8.43814e-02 0.449977000 0.036154800    AT_Mau_15_51 Austria
## calculate mean admixture proportions for each site

clusters <- grep("Cluster", names(qmatrix)) ## indexes of cluster columns
avg_admix <- aggregate(qmatrix[, clusters], list(qmatrix$Site, qmatrix$Ind), mean)
colnames(avg_admix)[1:2] <- c("country", "Ind")
head(avg_admix)
##   country             Ind   Cluster 1   Cluster 2   Cluster 3   Cluster 4
## 1 Austria   AT_gr_12_fall 0.048722400 0.309613000 9.99639e-05 9.99639e-05
## 2 Austria AT_gr_12_spring 0.261074000 0.049324700 8.74701e-03 9.99730e-05
## 3 Austria    AT_Mau_14_01 0.126625000 0.290528000 9.99730e-05 9.99730e-05
## 4 Austria    AT_Mau_14_02 0.083122600 0.000099973 1.66848e-02 9.99730e-05
## 5 Austria    AT_Mau_15_50 0.000099955 0.529978000 9.99550e-05 9.55478e-03
## 6 Austria    AT_Mau_15_51 0.000099964 0.096673000 9.99640e-05 9.99640e-05
##   Cluster 5   Cluster 6   Cluster 7   Cluster 8   Cluster 9
## 1 0.5840360 9.99639e-05 9.99639e-05 0.040000000 0.017229500
## 2 0.2930510 1.24752e-01 9.99730e-05 0.000099973 0.262751000
## 3 0.3630220 6.01603e-03 2.46401e-02 0.188868000 0.000099973
## 4 0.0992241 9.99730e-05 5.04343e-02 0.648635000 0.101599000
## 5 0.2290610 9.99550e-05 9.99550e-05 0.230907000 0.000099955
## 6 0.3324140 9.99640e-05 8.43814e-02 0.449977000 0.036154800
str(avg_admix)
## 'data.frame':    151 obs. of  11 variables:
##  $ country  : Factor w/ 15 levels "Austria","MI-WI",..: 1 1 1 1 1 1 5 5 5 5 ...
##  $ Ind      : chr  "AT_gr_12_fall" "AT_gr_12_spring" "AT_Mau_14_01" "AT_Mau_14_02" ...
##  $ Cluster 1: num  0.0487 0.2611 0.1266 0.0831 0.0001 ...
##  $ Cluster 2: num  0.3096 0.0493 0.2905 0.0001 0.53 ...
##  $ Cluster 3: num  0.0001 0.00875 0.0001 0.01668 0.0001 ...
##  $ Cluster 4: num  0.0001 0.0001 0.0001 0.0001 0.00955 ...
##  $ Cluster 5: num  0.584 0.2931 0.363 0.0992 0.2291 ...
##  $ Cluster 6: num  0.0001 0.12475 0.00602 0.0001 0.0001 ...
##  $ Cluster 7: num  0.0001 0.0001 0.0246 0.0504 0.0001 ...
##  $ Cluster 8: num  0.04 0.0001 0.1889 0.6486 0.2309 ...
##  $ Cluster 9: num  0.0172 0.2628 0.0001 0.1016 0.0001 ...
## import csv file containing coordinates

coor <- read.csv("coor.csv")
str(coor)
## 'data.frame':    16 obs. of  3 variables:
##  $ country: chr  "Austria" "Esparto" "Tuolumne" "Germany" ...
##  $ long   : num  15.96 -122.05 -120.26 9.71 10.21 ...
##  $ lat    : num  48.3 38.7 38 48.2 55.9 ...
head(coor)
##    country        long      lat
## 1  Austria   15.965000 48.28750
## 2  Esparto -122.046000 38.68000
## 3 Tuolumne -120.260000 37.97000
## 4  Germany    9.714757 48.19901
## 5  Denmark   10.212586 55.94513
## 6    Spain    1.279236 41.51821
admix <- merge(coor, avg_admix, by = "country")
str(admix)
## 'data.frame':    151 obs. of  13 variables:
##  $ country  : chr  "Austria" "Austria" "Austria" "Austria" ...
##  $ long     : num  16 16 16 16 16 ...
##  $ lat      : num  48.3 48.3 48.3 48.3 48.3 ...
##  $ Ind      : chr  "AT_gr_12_fall" "AT_gr_12_spring" "AT_Mau_14_01" "AT_Mau_14_02" ...
##  $ Cluster 1: num  0.0487 0.2611 0.1266 0.0831 0.0001 ...
##  $ Cluster 2: num  0.3096 0.0493 0.2905 0.0001 0.53 ...
##  $ Cluster 3: num  0.0001 0.00875 0.0001 0.01668 0.0001 ...
##  $ Cluster 4: num  0.0001 0.0001 0.0001 0.0001 0.00955 ...
##  $ Cluster 5: num  0.584 0.2931 0.363 0.0992 0.2291 ...
##  $ Cluster 6: num  0.0001 0.12475 0.00602 0.0001 0.0001 ...
##  $ Cluster 7: num  0.0001 0.0001 0.0246 0.0504 0.0001 ...
##  $ Cluster 8: num  0.04 0.0001 0.1889 0.6486 0.2309 ...
##  $ Cluster 9: num  0.0172 0.2628 0.0001 0.1016 0.0001 ...
head(admix)
##   country   long     lat             Ind   Cluster 1   Cluster 2   Cluster 3
## 1 Austria 15.965 48.2875   AT_gr_12_fall 0.048722400 0.309613000 9.99639e-05
## 2 Austria 15.965 48.2875 AT_gr_12_spring 0.261074000 0.049324700 8.74701e-03
## 3 Austria 15.965 48.2875    AT_Mau_14_01 0.126625000 0.290528000 9.99730e-05
## 4 Austria 15.965 48.2875    AT_Mau_14_02 0.083122600 0.000099973 1.66848e-02
## 5 Austria 15.965 48.2875    AT_Mau_15_50 0.000099955 0.529978000 9.99550e-05
## 6 Austria 15.965 48.2875    AT_Mau_15_51 0.000099964 0.096673000 9.99640e-05
##     Cluster 4 Cluster 5   Cluster 6   Cluster 7   Cluster 8   Cluster 9
## 1 9.99639e-05 0.5840360 9.99639e-05 9.99639e-05 0.040000000 0.017229500
## 2 9.99730e-05 0.2930510 1.24752e-01 9.99730e-05 0.000099973 0.262751000
## 3 9.99730e-05 0.3630220 6.01603e-03 2.46401e-02 0.188868000 0.000099973
## 4 9.99730e-05 0.0992241 9.99730e-05 5.04343e-02 0.648635000 0.101599000
## 5 9.55478e-03 0.2290610 9.99550e-05 9.99550e-05 0.230907000 0.000099955
## 6 9.99640e-05 0.3324140 9.99640e-05 8.43814e-02 0.449977000 0.036154800
colnames(admix)[c(5, 6, 7, 8, 9, 10, 11, 12, 13)] <- c("cluster1", "cluster2", "cluster3", "cluster4", "cluster5", "cluster6", "cluster7", "cluster8", "cluster9")



Plotting admixture proportion on a map

## Plotting map

quartz()

map("world", fill = TRUE, col = "gray", xlim = c(-130, 50), ylim = c(-20, 70), border = 0, mar = c(0, 5.5, 0, 0))

map.axes(cex.axis = 0.7, las = 1)
mtext(side = 1, line = 2, "Longitude")
mtext(side = 2, line = 2, "Latitude")


## Adding pie charts to the map

for(i in 1:nrow(admix)){
    add.pie(z = c(admix$cluster1[i], admix$cluster2[i], admix$cluster3[i], admix$cluster4[i], admix$cluster5[i], admix$cluster6[i], admix$cluster7[i], admix$cluster8[i], admix$cluster9[i]), x = admix$long[i], y = admix$lat[i], radius = 2, col = RColorBrewer::brewer.pal(9,"Paired"), labels = "", border = TRUE)
    
}

Figure 5. Mean admixture proportions across populations of D. melanogaster surveyed in America and Europe. Colors in the pie charts represent genetic clusters.





Isolation by Environment and Isolation by Distance analyses

unique(fly_gen3$pop)
##  [1] Austria         Esparto         Tuolumne        Germany        
##  [5] Denmark         Spain           Finland         France         
##  [9] NY-MA           MI-WI           PA              Russia         
## [13] Turkey          Ukraine         Charlottesville
## 15 Levels: Austria MI-WI Charlottesville Denmark Esparto Finland ... Ukraine
obj <- matrix(NA, nrow = length(unique(fly_gen3$pop)), ncol = 3, byrow = TRUE)

idx <- 1

for(i in unique(fly_gen3$pop)){
    long <- admix$long[admix$country == i][1]
    lat <- admix$lat[admix$country == i][1]
    country <- i
    obj[idx, ] <- c(country, long, lat)
    idx = idx + 1

}

obj
##       [,1]              [,2]          [,3]         
##  [1,] "Austria"         "15.965"      "48.2875"    
##  [2,] "Esparto"         "-122.046"    "38.68"      
##  [3,] "Tuolumne"        "-120.26"     "37.97"      
##  [4,] "Germany"         "9.7147565"   "48.199012"  
##  [5,] "Denmark"         "10.212586"   "55.945128"  
##  [6,] "Spain"           "1.279236"    "41.518208"  
##  [7,] "Finland"         "23.52"       "61.1"       
##  [8,] "France"          "3.5442054"   "46.8656662" 
##  [9,] "NY-MA"           "-74.085"     "42.445"     
## [10,] "MI-WI"           "-88.04915"   "42.61055"   
## [11,] "PA"              "-76.6350005" "40.3366975" 
## [12,] "Russia"          "33.235277"   "58.02444"   
## [13,] "Turkey"          "32.260328"   "40.231444"  
## [14,] "Ukraine"         "30.16599445" "49.47387778"
## [15,] "Charlottesville" "-78.47"      "38"
obj <- as.data.frame(obj)
colnames(obj) <- c("country", "x", "y")
obj$x <- as.numeric(obj$x)
obj$y <- as.numeric(obj$y)
rownames(obj) <- obj[ , 1]
obj <- obj[ , c(2, 3)]
str(obj)
## 'data.frame':    15 obs. of  2 variables:
##  $ x: num  15.96 -122.05 -120.26 9.71 10.21 ...
##  $ y: num  48.3 38.7 38 48.2 55.9 ...
fly_gen3@other <- obj[ , c(1, 2)]
fly_gen3
## /// GENIND OBJECT /////////
## 
##  // 151 individuals; 1,550 loci; 3,100 alleles; size: 2.6 Mb
## 
##  // Basic content
##    @tab:  151 x 3100 matrix of allele counts
##    @loc.n.all: number of alleles per locus (range: 2-2)
##    @loc.fac: locus factor for the 3100 columns of @tab
##    @all.names: list of allele names for each locus
##    @ploidy: ploidy of each individual  (range: 2-2)
##    @type:  codom
##    @call: .local(x = x, i = i, j = j, loc = ..1, drop = drop)
## 
##  // Optional content
##    @pop: population of each individual (group size range: 5-38)
##    @other: a list containing: x  y
## genetic distance

GD.pop.PairwiseFst.hierfstat <- as.dist(hierfstat::pairwise.neifst(hierfstat::genind2hierfstat(fly_gen3)))

## geographic distance

Dgeo <- round(dist(fly_gen3$other), 3)

## making sure the order of rows and columns are the same in all of the matrices

ord.gen <- as.matrix(GD.pop.PairwiseFst.hierfstat)
ord.geo <- as.matrix(Dgeo)

ord.geo
##                 Austria Esparto Tuolumne Germany Denmark   Spain Finland
## Austria           0.000 138.345  136.615   6.251   9.578  16.171  14.874
## Esparto         138.345   0.000    1.922 132.104 133.381 123.358 147.282
## Tuolumne        136.615   1.922    0.000 130.377 131.705 121.591 145.629
## Germany           6.251 132.104  130.377   0.000   7.762  10.761  18.895
## Denmark           9.578 133.381  131.705   7.762   0.000  16.969  14.271
## Spain            16.171 123.358  121.591  10.761  16.969   0.000  29.633
## Finland          14.874 147.282  145.629  18.895  14.271  29.633   0.000
## France           12.502 125.857  124.123   6.313  11.265   5.807  24.529
## NY-MA            90.239  48.109   46.391  83.997  85.372  75.370  99.372
## MI-WI           104.169  34.223   32.543  97.924  99.162  89.335 113.091
## PA               92.941  45.441   43.689  86.707  88.239  77.923 102.285
## Russia           19.826 156.482  154.800  25.490  23.116  35.967  10.190
## Turkey           18.178 154.314  152.537  23.912  27.074  31.008  22.625
## Ukraine          14.250 152.594  150.865  20.491  20.977  29.962  13.392
## Charlottesville  94.994  43.581   41.790  88.773  90.480  79.827 104.573
##                  France   NY-MA   MI-WI      PA  Russia  Turkey Ukraine
## Austria          12.502  90.239 104.169  92.941  19.826  18.178  14.250
## Esparto         125.857  48.109  34.223  45.441 156.482 154.314 152.594
## Tuolumne        124.123  46.391  32.543  43.689 154.800 152.537 150.865
## Germany           6.313  83.997  97.924  86.707  25.490  23.912  20.491
## Denmark          11.265  85.372  99.162  88.239  23.116  27.074  20.977
## Spain             5.807  75.370  89.335  77.923  35.967  31.008  29.962
## Finland          24.529  99.372 113.091 102.285  10.190  22.625  13.392
## France            0.000  77.755  91.692  80.445  31.719  29.473  26.749
## NY-MA            77.755   0.000  13.965   3.309 108.445 106.368 104.488
## MI-WI            91.692  13.965   0.000  11.638 122.260 120.333 118.414
## PA               80.445   3.309  11.638   0.000 111.285 108.895 107.191
## Russia           31.719 108.445 122.260 111.285   0.000  17.820   9.085
## Turkey           29.473 106.368 120.333 108.895  17.820   0.000   9.477
## Ukraine          26.749 104.488 118.414 107.191   9.085   9.477   0.000
## Charlottesville  82.492   6.244  10.631   2.971 113.486 110.753 109.240
##                 Charlottesville
## Austria                  94.994
## Esparto                  43.581
## Tuolumne                 41.790
## Germany                  88.773
## Denmark                  90.480
## Spain                    79.827
## Finland                 104.573
## France                   82.492
## NY-MA                     6.244
## MI-WI                    10.631
## PA                        2.971
## Russia                  113.486
## Turkey                  110.753
## Ukraine                 109.240
## Charlottesville           0.000
ord.gen <- ord.gen[rownames(ord.geo), colnames(ord.geo)]
ord.gen
##                 Austria Esparto Tuolumne Germany Denmark  Spain Finland France
## Austria          0.0000  0.0557   0.0588  0.0128  0.0181 0.0165  0.0151 0.0212
## Esparto          0.0557  0.0000  -0.0013  0.0441  0.0490 0.0446  0.0547 0.0414
## Tuolumne         0.0588 -0.0013   0.0000  0.0468  0.0533 0.0479  0.0601 0.0457
## Germany          0.0128  0.0441   0.0468  0.0000  0.0170 0.0077  0.0136 0.0100
## Denmark          0.0181  0.0490   0.0533  0.0170  0.0000 0.0256  0.0246 0.0186
## Spain            0.0165  0.0446   0.0479  0.0077  0.0256 0.0000  0.0200 0.0158
## Finland          0.0151  0.0547   0.0601  0.0136  0.0246 0.0200  0.0000 0.0222
## France           0.0212  0.0414   0.0457  0.0100  0.0186 0.0158  0.0222 0.0000
## NY-MA            0.0439  0.0089   0.0082  0.0335  0.0403 0.0295  0.0435 0.0334
## MI-WI            0.0540  0.0130   0.0094  0.0412  0.0501 0.0396  0.0492 0.0422
## PA               0.0541  0.0141   0.0099  0.0414  0.0519 0.0393  0.0521 0.0440
## Russia           0.0169  0.0535   0.0605  0.0210  0.0234 0.0318  0.0192 0.0222
## Turkey           0.0139  0.0600   0.0618  0.0188  0.0217 0.0264  0.0163 0.0280
## Ukraine          0.0137  0.0627   0.0674  0.0196  0.0234 0.0308  0.0214 0.0281
## Charlottesville  0.0582  0.0172   0.0153  0.0466  0.0541 0.0466  0.0574 0.0448
##                  NY-MA  MI-WI     PA Russia Turkey Ukraine Charlottesville
## Austria         0.0439 0.0540 0.0541 0.0169 0.0139  0.0137          0.0582
## Esparto         0.0089 0.0130 0.0141 0.0535 0.0600  0.0627          0.0172
## Tuolumne        0.0082 0.0094 0.0099 0.0605 0.0618  0.0674          0.0153
## Germany         0.0335 0.0412 0.0414 0.0210 0.0188  0.0196          0.0466
## Denmark         0.0403 0.0501 0.0519 0.0234 0.0217  0.0234          0.0541
## Spain           0.0295 0.0396 0.0393 0.0318 0.0264  0.0308          0.0466
## Finland         0.0435 0.0492 0.0521 0.0192 0.0163  0.0214          0.0574
## France          0.0334 0.0422 0.0440 0.0222 0.0280  0.0281          0.0448
## NY-MA           0.0000 0.0057 0.0017 0.0474 0.0482  0.0535          0.0031
## MI-WI           0.0057 0.0000 0.0042 0.0537 0.0537  0.0616          0.0049
## PA              0.0017 0.0042 0.0000 0.0596 0.0564  0.0646          0.0045
## Russia          0.0474 0.0537 0.0596 0.0000 0.0167  0.0067          0.0597
## Turkey          0.0482 0.0537 0.0564 0.0167 0.0000  0.0129          0.0574
## Ukraine         0.0535 0.0616 0.0646 0.0067 0.0129  0.0000          0.0656
## Charlottesville 0.0031 0.0049 0.0045 0.0597 0.0574  0.0656          0.0000
fst.dist <- round(as.dist(ord.gen), 3)
fst.dist
##                 Austria Esparto Tuolumne Germany Denmark  Spain Finland France
## Esparto           0.056                                                       
## Tuolumne          0.059  -0.001                                               
## Germany           0.013   0.044    0.047                                      
## Denmark           0.018   0.049    0.053   0.017                              
## Spain             0.016   0.045    0.048   0.008   0.026                      
## Finland           0.015   0.055    0.060   0.014   0.025  0.020               
## France            0.021   0.041    0.046   0.010   0.019  0.016   0.022       
## NY-MA             0.044   0.009    0.008   0.034   0.040  0.029   0.044  0.033
## MI-WI             0.054   0.013    0.009   0.041   0.050  0.040   0.049  0.042
## PA                0.054   0.014    0.010   0.041   0.052  0.039   0.052  0.044
## Russia            0.017   0.054    0.060   0.021   0.023  0.032   0.019  0.022
## Turkey            0.014   0.060    0.062   0.019   0.022  0.026   0.016  0.028
## Ukraine           0.014   0.063    0.067   0.020   0.023  0.031   0.021  0.028
## Charlottesville   0.058   0.017    0.015   0.047   0.054  0.047   0.057  0.045
##                  NY-MA  MI-WI     PA Russia Turkey Ukraine
## Esparto                                                   
## Tuolumne                                                  
## Germany                                                   
## Denmark                                                   
## Spain                                                     
## Finland                                                   
## France                                                    
## NY-MA                                                     
## MI-WI            0.006                                    
## PA               0.002  0.004                             
## Russia           0.047  0.054  0.060                      
## Turkey           0.048  0.054  0.056  0.017               
## Ukraine          0.054  0.062  0.065  0.007  0.013        
## Charlottesville  0.003  0.005  0.004  0.060  0.057   0.066
Dgeo
##                 Austria Esparto Tuolumne Germany Denmark   Spain Finland
## Esparto         138.345                                                 
## Tuolumne        136.615   1.922                                         
## Germany           6.251 132.104  130.377                                
## Denmark           9.578 133.381  131.705   7.762                        
## Spain            16.171 123.358  121.591  10.761  16.969                
## Finland          14.874 147.282  145.629  18.895  14.271  29.633        
## France           12.502 125.857  124.123   6.313  11.265   5.807  24.529
## NY-MA            90.239  48.109   46.391  83.997  85.372  75.370  99.372
## MI-WI           104.169  34.223   32.543  97.924  99.162  89.335 113.091
## PA               92.941  45.441   43.689  86.707  88.239  77.923 102.285
## Russia           19.826 156.482  154.800  25.490  23.116  35.967  10.190
## Turkey           18.178 154.314  152.537  23.912  27.074  31.008  22.625
## Ukraine          14.250 152.594  150.865  20.491  20.977  29.962  13.392
## Charlottesville  94.994  43.581   41.790  88.773  90.480  79.827 104.573
##                  France   NY-MA   MI-WI      PA  Russia  Turkey Ukraine
## Esparto                                                                
## Tuolumne                                                               
## Germany                                                                
## Denmark                                                                
## Spain                                                                  
## Finland                                                                
## France                                                                 
## NY-MA            77.755                                                
## MI-WI            91.692  13.965                                        
## PA               80.445   3.309  11.638                                
## Russia           31.719 108.445 122.260 111.285                        
## Turkey           29.473 106.368 120.333 108.895  17.820                
## Ukraine          26.749 104.488 118.414 107.191   9.085   9.477        
## Charlottesville  82.492   6.244  10.631   2.971 113.486 110.753 109.240
NPP_file <- list.files("/Users/dylanpadilla/Dropbox (ASU)/npp-geotiff", full.names = TRUE)
NPP_file
## [1] "/Users/dylanpadilla/Dropbox (ASU)/npp-geotiff/npp_geotiff.tif"
NPP.data <- stack(NPP_file)

NPP.data.ext.dis <- extract(NPP.data, obj)
head(NPP.data.ext.dis)
##             layer
## [1,] 264522317824
## [2,] 391191265280
## [3,] 367969927168
## [4,] 348927459328
## [5,]  25091205120
## [6,] 268916719616
rownames(NPP.data.ext.dis) <- rownames(obj)

DNPP <- round(dist(log(NPP.data.ext.dis)), 3)
DNPP
##                 Austria Esparto Tuolumne Germany Denmark Spain Finland France
## Esparto           0.391                                                      
## Tuolumne          0.330   0.061                                              
## Germany           0.277   0.114    0.053                                     
## Denmark           2.355   2.747    2.685   2.632                             
## Spain             0.016   0.375    0.314   0.260   2.372                     
## Finland           0.610   1.001    0.940   0.886   1.746 0.626               
## France            0.577   0.185    0.247   0.300   2.932 0.560   1.186       
## NY-MA             0.342   0.050    0.011   0.065   2.697 0.325   0.951  0.235
## MI-WI             0.236   0.155    0.094   0.041   2.592 0.220   0.846  0.340
## PA                0.452   0.061    0.122   0.175   2.808 0.436   1.062  0.124
## Russia            0.549   0.940    0.879   0.826   1.806 0.565   0.061  1.126
## Turkey            0.626   1.017    0.956   0.903   1.729 0.643   0.017  1.203
## Ukraine           0.075   0.466    0.405   0.352   2.280 0.092   0.534  0.652
## Charlottesville   0.526   0.134    0.196   0.249   2.881 0.509   1.135  0.051
##                 NY-MA MI-WI    PA Russia Turkey Ukraine
## Esparto                                                
## Tuolumne                                               
## Germany                                                
## Denmark                                                
## Spain                                                  
## Finland                                                
## France                                                 
## NY-MA                                                  
## MI-WI           0.105                                  
## PA              0.111 0.216                            
## Russia          0.891 0.785 1.001                      
## Turkey          0.968 0.862 1.078  0.077               
## Ukraine         0.417 0.311 0.527  0.474  0.551        
## Charlottesville 0.184 0.290 0.074  1.075  1.152   0.601
## partial mantel test

part.mant <- mantel.partial(fst.dist, DNPP, Dgeo, method = "pearson", permutations = 1000)
part.mant
## 
## Partial Mantel statistic based on Pearson's product-moment correlation 
## 
## Call:
## mantel.partial(xdis = fst.dist, ydis = DNPP, zdis = Dgeo, method = "pearson",      permutations = 1000) 
## 
## Mantel statistic r: 0.3608 
##       Significance: 0.000999 
## 
## Upper quantiles of permutations (null model):
##    90%    95%  97.5%    99% 
## 0.0851 0.1147 0.1445 0.1974 
## Permutation: free
## Number of permutations: 1000
part.mant2 <- mantel.partial(fst.dist, Dgeo, DNPP, method = "pearson", permutations = 1000)
part.mant2
## 
## Partial Mantel statistic based on Pearson's product-moment correlation 
## 
## Call:
## mantel.partial(xdis = fst.dist, ydis = Dgeo, zdis = DNPP, method = "pearson",      permutations = 1000) 
## 
## Mantel statistic r: 0.9262 
##       Significance: 0.000999 
## 
## Upper quantiles of permutations (null model):
##   90%   95% 97.5%   99% 
## 0.106 0.142 0.216 0.340 
## Permutation: free
## Number of permutations: 1000
## mantel test

ibd <- mantel.randtest(fst.dist, Dgeo)
ibd
## Monte-Carlo test
## Call: mantel.randtest(m1 = fst.dist, m2 = Dgeo)
## 
## Observation: 0.9184113 
## 
## Based on 999 replicates
## Simulated p-value: 0.001 
## Alternative hypothesis: greater 
## 
##      Std.Obs  Expectation     Variance 
## 9.1323592883 0.0004866897 0.0101029459
quartz()
plot(ibd, las = 1, main = "") ## isolation by distance is clearly significant

mantCor <- mgram(fst.dist, Dgeo, nperm = 1000)
mantCor
## $mgram
##         lag ngroup     mantelr  pval          llim        ulim
## [1,]   9.66     27  0.64245382 0.001  5.911163e-01  0.69701998
## [2,]  28.98     18  0.28493265 0.010  2.269430e-01  0.34959953
## [3,]  48.30      6  0.27711014 0.012  1.790647e-01  0.34965251
## [4,]  67.62      1  0.02464136 0.956 -5.169186e-17  0.03644222
## [5,]  86.94     16 -0.23755747 0.026 -3.447686e-01 -0.14792562
## [6,] 106.26     16 -0.43851519 0.001 -5.476339e-01 -0.37083721
## [7,] 125.58     11 -0.27585827 0.011 -3.444213e-01 -0.19623107
## [8,] 144.90      8 -0.39397231 0.002 -5.044866e-01 -0.27736000
## 
## $resids
## [1] NA
## 
## attr(,"class")
## [1] "mgram"
mantCor2 <- mgram(fst.dist, DNPP, nperm = 1000)
mantCor2
## $mgram
##            lag ngroup     mantelr  pval          llim          ulim
## [1,] 0.1825625     44  0.36681725 0.003  2.157538e-01  5.436962e-01
## [2,] 0.5476875     23 -0.07860655 0.291 -2.448150e-01  6.119091e-02
## [3,] 0.9128125     19 -0.36758903 0.004 -5.006758e-01 -2.413466e-01
## [4,] 1.2779375      5 -0.03904606 0.647 -1.879859e-01  9.156261e-02
## [5,] 1.6430625      3  0.09357366 0.325 -1.587588e-17  1.267946e-01
## [6,] 2.0081875      0  0.00000000 1.000  0.000000e+00  0.000000e+00
## [7,] 2.3733125      3  0.10248140 0.277 -4.074927e-17  1.332049e-01
## [8,] 2.7384375      7 -0.15494628 0.066 -2.148389e-01  8.845990e-18
## 
## $resids
## [1] NA
## 
## attr(,"class")
## [1] "mgram"
plot(mantCor, las = 1)

plot(mantCor2, las = 1)

## filled dots indicate significant correlations. The Mantel correlogram shows that genotypes are relatively similar at short distance, while this similarity decreases with distance. The negative correlations indicate genetic differenciation between some populations


dens <- MASS::kde2d(as.vector(Dgeo), as.vector(fst.dist), n = 300)
myPal <- colorRampPalette(c("white", "blue", "gold", "orange", "red"))
plot(as.vector(Dgeo), as.vector(fst.dist), pch = 20, xlab = "Geographic Distance", ylab = "Genetic Distance", las = 1)
image(dens, col = transp(myPal(300), 0.7), add = TRUE)
abline(lm(as.vector(fst.dist) ~ as.vector(Dgeo)), lwd = 2)
lines(loess.smooth(as.vector(Dgeo), as.vector(fst.dist)), col = "red", lwd = 2)

dens2 <- MASS::kde2d(as.vector(DNPP), as.vector(fst.dist), n = 300)
myPal2 <- colorRampPalette(c("white", "blue", "gold", "orange", "red"))
plot(as.vector(DNPP), as.vector(fst.dist), pch = 20, xlab = "NPP Distance", ylab = "Genetic Distance", las = 1)
image(dens2, col = transp(myPal2(300), 0.7), add = TRUE)
abline(lm(as.vector(fst.dist) ~ as.vector(DNPP)), lwd = 2)
lines(loess.smooth(as.vector(DNPP), as.vector(fst.dist)), col = "red", lwd = 2)

layout(matrix(c(1, 1, 2, 2,
                1, 1, 2, 2,
                3, 3, 4, 4,
                3, 3, 4, 4), nrow = 4, ncol = 4, byrow = TRUE))


par(mar = c(4, 4.1, 1, 1))
plot(as.vector(Dgeo), as.vector(fst.dist), pch = 20, xlab = "Geographic Distance", ylab = "Genetic Distance", las = 1, cex.lab = 1.4)
image(dens, col = transp(myPal(300), 0.7), add = TRUE)
abline(lm(as.vector(fst.dist) ~ as.vector(Dgeo)), lwd = 2)
lines(loess.smooth(as.vector(Dgeo), as.vector(fst.dist)), col = "red", lwd = 2)
mtext("(a)", side = 2, at = 0.07, line = 2.6, font = 2, family = "serif", las = 1)

par(mar = c(4, 4.1, 1, 1))
plot(as.vector(DNPP), as.vector(fst.dist), pch = 20, xlab = "NPP Distance", ylab = "Genetic Distance", las = 1, cex.lab = 1.4)
image(dens2, col = transp(myPal2(300), 0.7), add = TRUE)
abline(lm(as.vector(fst.dist) ~ as.vector(DNPP)), lwd = 2)
lines(loess.smooth(as.vector(DNPP), as.vector(fst.dist)), col = "red", lwd = 2)
mtext("(b)", side = 2, at = 0.07, line = 3.1, font = 2, family = "serif", las = 1)

par(mar = c(4, 4.1, 1, 1))
plot(mantCor, las = 1, cex.axis = 1, cex.lab = 1.6)
mtext("(c)", side = 2, at = 0.7, line = 2.6, font = 2, family = "serif", las = 1)

par(mar = c(4, 4.1, 1, 1))
plot(mantCor2, las = 1, cex.axis = 1, cex.lab = 1.6, ylim = c(-0.4, 0.6))
mtext("(d)", side = 2, at = 0.7, line = 3.1, font = 2, family = "serif", las = 1)

Figure 2. Patterns of isolation by distance and isolation by environment according to Partial Mantel tests and Correlograms. (a-b) Geographic and net primary production distances as a function of genetic distance. Colors represent estimated probability densities and the red line a smoothed local mean. (c-d) Mantel correlation at different distance classes as a function of geographic and net primary production distances. Filled dots indicate significant correlations.



Detecting adaptation

## extracting environmental variables from WorldClim and other sources


mat_adap <- admix[ , c(1, 2, 3, 4)]
head(mat_adap)
##   country   long     lat             Ind
## 1 Austria 15.965 48.2875   AT_gr_12_fall
## 2 Austria 15.965 48.2875 AT_gr_12_spring
## 3 Austria 15.965 48.2875    AT_Mau_14_01
## 4 Austria 15.965 48.2875    AT_Mau_14_02
## 5 Austria 15.965 48.2875    AT_Mau_15_50
## 6 Austria 15.965 48.2875    AT_Mau_15_51
## extracting NPP


NPP_file <- list.files("/Users/dylanpadilla/Dropbox (ASU)/npp-geotiff", full.names = TRUE)
NPP_file
## [1] "/Users/dylanpadilla/Dropbox (ASU)/npp-geotiff/npp_geotiff.tif"
NPP.data <- stack(NPP_file)

prod <- mat_adap[ , c(2, 3)] ## loading the dataframe with the coordinates
head(prod)
##     long     lat
## 1 15.965 48.2875
## 2 15.965 48.2875
## 3 15.965 48.2875
## 4 15.965 48.2875
## 5 15.965 48.2875
## 6 15.965 48.2875
NPP.data.ext <- extract(NPP.data, prod)
head(NPP.data.ext)
##             layer
## [1,] 264522317824
## [2,] 264522317824
## [3,] 264522317824
## [4,] 264522317824
## [5,] 264522317824
## [6,] 264522317824
NPP.data.ext <- as.data.frame(NPP.data.ext)
colnames(NPP.data.ext) <- "NPP"
head(NPP.data.ext)
##            NPP
## 1 264522317824
## 2 264522317824
## 3 264522317824
## 4 264522317824
## 5 264522317824
## 6 264522317824
##is.na(NPP.data.ext)


## extracting BIO15 precipitation seasonality (coefficient of variation), BIO4 temperature seasonality (standard deviation * 100)

files <- list.files("/Users/dylanpadilla/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio", full.names = TRUE)
files
##  [1] "/Users/dylanpadilla/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_10.tif"
##  [2] "/Users/dylanpadilla/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_11.tif"
##  [3] "/Users/dylanpadilla/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_14.tif"
##  [4] "/Users/dylanpadilla/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_15.tif"
##  [5] "/Users/dylanpadilla/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_16.tif"
##  [6] "/Users/dylanpadilla/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_18.tif"
##  [7] "/Users/dylanpadilla/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_19.tif"
##  [8] "/Users/dylanpadilla/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_2.tif" 
##  [9] "/Users/dylanpadilla/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_3.tif" 
## [10] "/Users/dylanpadilla/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_4.tif" 
## [11] "/Users/dylanpadilla/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_5.tif" 
## [12] "/Users/dylanpadilla/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_6.tif" 
## [13] "/Users/dylanpadilla/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_8.tif" 
## [14] "/Users/dylanpadilla/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_9.tif"
pre_sea <- stack(files[4])
tem_sea <- stack(files[10])

coord <- mat_adap[ , c(2, 3)]
head(coord)
##     long     lat
## 1 15.965 48.2875
## 2 15.965 48.2875
## 3 15.965 48.2875
## 4 15.965 48.2875
## 5 15.965 48.2875
## 6 15.965 48.2875
prec.season <- extract(pre_sea, coord)
temp.season <- extract(tem_sea, coord)

variables <- cbind(prec.season, temp.season, log(NPP.data.ext))
head(variables)
##      layer    layer      NPP
## 1 37.24262 744.4373 26.30119
## 2 37.24262 744.4373 26.30119
## 3 37.24262 744.4373 26.30119
## 4 37.24262 744.4373 26.30119
## 5 37.24262 744.4373 26.30119
## 6 37.24262 744.4373 26.30119
colnames(variables) <- c("precseason", "tempseason", "logNPP")
head(variables)
##   precseason tempseason   logNPP
## 1   37.24262   744.4373 26.30119
## 2   37.24262   744.4373 26.30119
## 3   37.24262   744.4373 26.30119
## 4   37.24262   744.4373 26.30119
## 5   37.24262   744.4373 26.30119
## 6   37.24262   744.4373 26.30119
env <- cbind(mat_adap, variables)
head(env)
##   country   long     lat             Ind precseason tempseason   logNPP
## 1 Austria 15.965 48.2875   AT_gr_12_fall   37.24262   744.4373 26.30119
## 2 Austria 15.965 48.2875 AT_gr_12_spring   37.24262   744.4373 26.30119
## 3 Austria 15.965 48.2875    AT_Mau_14_01   37.24262   744.4373 26.30119
## 4 Austria 15.965 48.2875    AT_Mau_14_02   37.24262   744.4373 26.30119
## 5 Austria 15.965 48.2875    AT_Mau_15_50   37.24262   744.4373 26.30119
## 6 Austria 15.965 48.2875    AT_Mau_15_51   37.24262   744.4373 26.30119
str(env)
## 'data.frame':    151 obs. of  7 variables:
##  $ country   : chr  "Austria" "Austria" "Austria" "Austria" ...
##  $ long      : num  16 16 16 16 16 ...
##  $ lat       : num  48.3 48.3 48.3 48.3 48.3 ...
##  $ Ind       : chr  "AT_gr_12_fall" "AT_gr_12_spring" "AT_Mau_14_01" "AT_Mau_14_02" ...
##  $ precseason: num  37.2 37.2 37.2 37.2 37.2 ...
##  $ tempseason: num  744 744 744 744 744 ...
##  $ logNPP    : num  26.3 26.3 26.3 26.3 26.3 ...
## confirm that genotypes and environmental data are in the same order

x <- tab(fly_gen3, NA.method = "mean")

snppca <- as.data.frame(x)
snppca$Ind <- rownames(x)
snppca$country <- fly_gen3$pop
dim(snppca)
## [1]  151 3102
gen <- snppca[order(snppca$country), ]

merging <- merge(gen, env, by = "Ind")
merging[1:5, 1:5]
##               Ind 162333.A 162333.C 162335.C 162335.T
## 1   AT_gr_12_fall        1        1        1        1
## 2 AT_gr_12_spring        1        1        1        1
## 3    AT_Mau_14_01        0        2        1        1
## 4    AT_Mau_14_02        1        1        1        1
## 5    AT_Mau_15_50        1        1        1        1
rownames(merging) <- merging[ , 1]
dim(merging)
## [1]  151 3108
merging[1:5, 3101:3108]
##                 163997.T country.x country.y   long     lat precseason
## AT_gr_12_fall          0   Austria   Austria 15.965 48.2875   37.24262
## AT_gr_12_spring        0   Austria   Austria 15.965 48.2875   37.24262
## AT_Mau_14_01           0   Austria   Austria 15.965 48.2875   37.24262
## AT_Mau_14_02           1   Austria   Austria 15.965 48.2875   37.24262
## AT_Mau_15_50           0   Austria   Austria 15.965 48.2875   37.24262
##                 tempseason   logNPP
## AT_gr_12_fall     744.4373 26.30119
## AT_gr_12_spring   744.4373 26.30119
## AT_Mau_14_01      744.4373 26.30119
## AT_Mau_14_02      744.4373 26.30119
## AT_Mau_15_50      744.4373 26.30119
merging <- merging[ , -c(1, 3102, 3103, 3104, 3105, 3106, 3107, 3108)]
merging[1:5, 3085:3100]
##                 163989.A 163989.C 163991.C 163991.G 163992.A 163992.G 163993.T
## AT_gr_12_fall          1        1        1        1        2        0        1
## AT_gr_12_spring        2        0        1        1        2        0        1
## AT_Mau_14_01           1        1        2        0        1        1        1
## AT_Mau_14_02           1        1        2        0        2        0        1
## AT_Mau_15_50           1        1        1        1        1        1        1
##                 163993.C 163994.G 163994.A 163995.G 163995.A 163996.G 163996.C
## AT_gr_12_fall          1        1        1        2        0        1        1
## AT_gr_12_spring        1        1        1        2        0        1        1
## AT_Mau_14_01           1        1        1        2        0        1        1
## AT_Mau_14_02           1        1        1        2        0        1        1
## AT_Mau_15_50           1        1        1        2        0        1        1
##                 163997.A 163997.T
## AT_gr_12_fall          2        0
## AT_gr_12_spring        2        0
## AT_Mau_14_01           2        0
## AT_Mau_14_02           1        1
## AT_Mau_15_50           2        0
dim(merging)
## [1]  151 3100
genomic <- merging

env$Ind <- as.factor(env$Ind)
env$Ind <- as.character(levels(env$Ind))
dim(snppca)
## [1]  151 3102
snppca[1:5, 1:5]
##                 162333.A 162333.C 162335.C 162335.T 162336.T
## AT_gr_12_fall          1        1        1        1        2
## AT_gr_12_spring        1        1        1        1        2
## AT_Mau_14_01           0        2        1        1        1
## AT_Mau_14_02           1        1        1        1        2
## AT_Mau_15_50           1        1        1        1        2
snppca[1:5, 3085:3102]
##                 163989.A 163989.C 163991.C 163991.G 163992.A 163992.G 163993.T
## AT_gr_12_fall          1        1        1        1        2        0        1
## AT_gr_12_spring        2        0        1        1        2        0        1
## AT_Mau_14_01           1        1        2        0        1        1        1
## AT_Mau_14_02           1        1        2        0        2        0        1
## AT_Mau_15_50           1        1        1        1        1        1        1
##                 163993.C 163994.G 163994.A 163995.G 163995.A 163996.G 163996.C
## AT_gr_12_fall          1        1        1        2        0        1        1
## AT_gr_12_spring        1        1        1        2        0        1        1
## AT_Mau_14_01           1        1        1        2        0        1        1
## AT_Mau_14_02           1        1        1        2        0        1        1
## AT_Mau_15_50           1        1        1        2        0        1        1
##                 163997.A 163997.T             Ind country
## AT_gr_12_fall          2        0   AT_gr_12_fall Austria
## AT_gr_12_spring        2        0 AT_gr_12_spring Austria
## AT_Mau_14_01           2        0    AT_Mau_14_01 Austria
## AT_Mau_14_02           1        1    AT_Mau_14_02 Austria
## AT_Mau_15_50           2        0    AT_Mau_15_50 Austria
snppca <- snppca[ , -c(3101, 3102)]
snppca[1:5, 3085:3100]
##                 163989.A 163989.C 163991.C 163991.G 163992.A 163992.G 163993.T
## AT_gr_12_fall          1        1        1        1        2        0        1
## AT_gr_12_spring        2        0        1        1        2        0        1
## AT_Mau_14_01           1        1        2        0        1        1        1
## AT_Mau_14_02           1        1        2        0        2        0        1
## AT_Mau_15_50           1        1        1        1        1        1        1
##                 163993.C 163994.G 163994.A 163995.G 163995.A 163996.G 163996.C
## AT_gr_12_fall          1        1        1        2        0        1        1
## AT_gr_12_spring        1        1        1        2        0        1        1
## AT_Mau_14_01           1        1        1        2        0        1        1
## AT_Mau_14_02           1        1        1        2        0        1        1
## AT_Mau_15_50           1        1        1        2        0        1        1
##                 163997.A 163997.T
## AT_gr_12_fall          2        0
## AT_gr_12_spring        2        0
## AT_Mau_14_01           2        0
## AT_Mau_14_02           1        1
## AT_Mau_15_50           2        0
dim(snppca)
## [1]  151 3100
identical(rownames(genomic), env[ , 4]) ## genotypes and environmental data are in the same order
## [1] TRUE
names(env)
## [1] "country"    "long"       "lat"        "Ind"        "precseason"
## [6] "tempseason" "logNPP"
env.pca <- rda(env[ , c(2, 3, 5, 6, 7)], scale = T)
summary(env.pca)$cont
## $importance
## Importance of components:
##                          PC1    PC2    PC3     PC4     PC5
## Eigenvalue            2.1427 1.2189 0.8322 0.48306 0.32307
## Proportion Explained  0.4285 0.2438 0.1664 0.09661 0.06461
## Cumulative Proportion 0.4285 0.6723 0.8388 0.93539 1.00000
quartz()
par(las = 1)
screeplot(env.pca, main = " ")

round(scores(env.pca, choices = 1:5, display = "species", scaling = 0), digits = 3)
##               PC1    PC2    PC3    PC4    PC5
## long       -0.592 -0.002 -0.149 -0.421  0.671
## lat        -0.598  0.068  0.080 -0.336 -0.720
## precseason  0.127 -0.689  0.635 -0.321  0.049
## tempseason  0.004  0.681  0.716 -0.022  0.151
## logNPP      0.525  0.236 -0.236 -0.779 -0.077
## attr(,"const")
## [1] 5.233176
pred.PC1 <- scores(env.pca, choices = 1, display = "sites", scaling = 0)


## determine K

## I’ll use a broken stick criterion to determine K

par(las = 1)
screeplot(env.pca, bstick = TRUE, type = "lines", las = 1, main = "")

## genomic data

gen.pca <- rda(snppca, scale = T)

par(las = 1)
screeplot(gen.pca, main = " ", bstick = TRUE, type = "lines")

## for the genomic data, I can see that two of the PCs have eigenvalues greater than random (greater than the broken stick values in red). This effectively means that K = 2 for the dataset, based on a PCA assessment

K <- 2:9


fly.lfmm1 <- lfmm_ridge(Y = snppca, X = pred.PC1, K = K[1]) ## c.ange K as you see fit
fly.lfmm2 <- lfmm_ridge(Y = snppca, X = pred.PC1, K = K[2]) ## c.ange K as you see fit
fly.lfmm3 <- lfmm_ridge(Y = snppca, X = pred.PC1, K = K[3]) ## c.ange K as you see fit
fly.lfmm4 <- lfmm_ridge(Y = snppca, X = pred.PC1, K = K[4]) ## c.ange K as you see fit
fly.lfmm5 <- lfmm_ridge(Y = snppca, X = pred.PC1, K = K[5]) ## c.ange K as you see fit
fly.lfmm6 <- lfmm_ridge(Y = snppca, X = pred.PC1, K = K[6]) ## c.ange K as you see fit
fly.lfmm7 <- lfmm_ridge(Y = snppca, X = pred.PC1, K = K[7]) ## c.ange K as you see fit
fly.lfmm8 <- lfmm_ridge(Y = snppca, X = pred.PC1, K = K[8]) ## c.ange K as you see fit

fly.pv1 <- lfmm_test(Y = snppca, X = pred.PC1, lfmm = fly.lfmm1, calibrate = "gif")
fly.pv2 <- lfmm_test(Y = snppca, X = pred.PC1, lfmm = fly.lfmm2, calibrate = "gif")
fly.pv3 <- lfmm_test(Y = snppca, X = pred.PC1, lfmm = fly.lfmm3, calibrate = "gif")
fly.pv4 <- lfmm_test(Y = snppca, X = pred.PC1, lfmm = fly.lfmm4, calibrate = "gif")
fly.pv5 <- lfmm_test(Y = snppca, X = pred.PC1, lfmm = fly.lfmm5, calibrate = "gif")
fly.pv6 <- lfmm_test(Y = snppca, X = pred.PC1, lfmm = fly.lfmm6, calibrate = "gif")
fly.pv7 <- lfmm_test(Y = snppca, X = pred.PC1, lfmm = fly.lfmm7, calibrate = "gif")
fly.pv8 <- lfmm_test(Y = snppca, X = pred.PC1, lfmm = fly.lfmm8, calibrate = "gif")

names(fly.pv1) # this object includes raw z-scores and p-values, as well as GIF-calibrated scores and p-values
## [1] "B"                 "epsilon.sigma2"    "B.sigma2"         
## [4] "score"             "pvalue"            "gif"              
## [7] "calibrated.score2" "calibrated.pvalue"
## let’s look at the genomic inflation factor (GIF)

fly.pv1$gif
##      PC1 
## 1.522458
fly.pv2$gif
##      PC1 
## 1.354105
fly.pv3$gif
##      PC1 
## 1.378276
fly.pv4$gif
##      PC1 
## 1.307732
fly.pv5$gif
##      PC1 
## 1.286765
fly.pv6$gif
##      PC1 
## 1.310563
fly.pv7$gif
##      PC1 
## 1.314588
fly.pv8$gif
##      PC1 
## 1.323961
## an appropriately calibrated set of tests will have a GIF of around 1

## let’s look at how application of the GIF to the p-values impacts the p-value distribution


quartz()
layout(matrix(c(1, 1, 2, 2,
                1, 1, 2, 2,
                3, 3, 4, 4,
                3, 3, 4, 4,
                5, 5, 6, 6,
                5, 5, 6, 6,
                7, 7, 8, 8,
                7, 7, 8, 8), nrow = 8, ncol = 4, byrow = TRUE))

par(mar = c(5, 5, 0.2, 1), las = 1)

hist(fly.pv1$pvalue[ , 1], xlab = "Unadjusted p-values", las = 1, main = "", ylim = c(0, 500))
legend("top", "K = 2", bty = "n")
hist(fly.pv1$calibrated.pvalue[ , 1], xlab = "GIF-adjusted p-values", las = 1, main = "", ylim = c(0, 350))
legend("top", "K = 2", bty = "n")
hist(fly.pv2$pvalue[ , 1], xlab = "Unadjusted p-values", las = 1, main = "", ylim = c(0, 500))
legend("top", "K = 3", bty = "n")
hist(fly.pv2$calibrated.pvalue[ , 1], xlab = "GIF-adjusted p-values", las = 1, main = "", ylim = c(0, 350))
legend("top", "K = 3", bty = "n")
hist(fly.pv3$pvalue[ , 1], xlab = "Unadjusted p-values", las = 1, main = "", ylim = c(0, 500))
legend("top", "K = 4", bty = "n")
hist(fly.pv3$calibrated.pvalue[ , 1], xlab = "GIF-adjusted p-values", las = 1, main = "", ylim = c(0, 350))
legend("top", "K = 4", bty = "n")
hist(fly.pv4$pvalue[ , 1], xlab = "Unadjusted p-values", las = 1, main = "", ylim = c(0, 500))
legend("top", "K = 5", bty = "n")
hist(fly.pv4$calibrated.pvalue[ , 1], xlab = "GIF-adjusted p-values", las = 1, main = "", ylim = c(0, 350))
legend("top", "K = 5", bty = "n")

quartz()
layout(matrix(c(1, 1, 2, 2,
                1, 1, 2, 2,
                3, 3, 4, 4,
                3, 3, 4, 4,
                5, 5, 6, 6,
                5, 5, 6, 6,
                7, 7, 8, 8,
                7, 7, 8, 8), nrow = 8, ncol = 4, byrow = TRUE))

par(mar = c(5, 5, 0.2, 1), las = 1)

hist(fly.pv5$pvalue[ , 1], xlab = "Unadjusted p-values", las = 1, main = "", ylim = c(0, 500))
legend("top", "K = 6", bty = "n")
hist(fly.pv5$calibrated.pvalue[ , 1], xlab = "GIF-adjusted p-values", las = 1, main = "", ylim = c(0, 350))
legend("top", "K = 6", bty = "n")
hist(fly.pv6$pvalue[ , 1], xlab = "Unadjusted p-values", las = 1, main = "", ylim = c(0, 500))
legend("top", "K = 7", bty = "n")
hist(fly.pv6$calibrated.pvalue[ , 1], xlab = "GIF-adjusted p-values", las = 1, main = "", ylim = c(0, 350))
legend("top", "K = 7", bty = "n")
hist(fly.pv7$pvalue[ , 1], xlab = "Unadjusted p-values", las = 1, main = "", ylim = c(0, 500))
legend("top", "K = 8", bty = "n")
hist(fly.pv7$calibrated.pvalue[ , 1], xlab = "GIF-adjusted p-values", las = 1, main = "", ylim = c(0, 350))
legend("top", "K = 8", bty = "n")
hist(fly.pv8$pvalue[ , 1], xlab = "Unadjusted p-values", las = 1, main = "", ylim = c(0, 500))
legend("top", "K = 9", bty = "n")
hist(fly.pv8$calibrated.pvalue[ , 1], xlab = "GIF-adjusted p-values", las = 1, main = "", ylim = c(0, 350))
legend("top", "K = 9", bty = "n")

## convert the adjusted p-values to q-values. q-values provide a measure of each SNP’s significance, automatically taking into account the fact that thousands are simultaneously being tested. I can then use an FDR threshold to control the number of false positive detections (given that the p-value distribution is “well-behaved”)

fly.qv1 <- qvalue(fly.pv1$calibrated.pvalue)$qvalues
length(which(fly.qv1 < 0.05)) ## how many SNPs have an FDR < 5%?
## [1] 0
fly.qv2 <- qvalue(fly.pv2$calibrated.pvalue)$qvalues
length(which(fly.qv2 < 0.05)) ## how many SNPs have an FDR < 5%?
## [1] 0
fly.qv3 <- qvalue(fly.pv3$calibrated.pvalue)$qvalues
length(which(fly.qv3 < 0.05)) ## how many SNPs have an FDR < 5%?
## [1] 0
fly.qv4 <- qvalue(fly.pv4$calibrated.pvalue)$qvalues
length(which(fly.qv4 < 0.05)) ## how many SNPs have an FDR < 5%?
## [1] 0
fly.qv5 <- qvalue(fly.pv5$calibrated.pvalue)$qvalues
length(which(fly.qv5 < 0.05)) ## how many SNPs have an FDR < 5%?
## [1] 4
fly.qv6 <- qvalue(fly.pv6$calibrated.pvalue)$qvalues
length(which(fly.qv5 < 0.05)) ## how many SNPs have an FDR < 5%?
## [1] 4
fly.qv7 <- qvalue(fly.pv7$calibrated.pvalue)$qvalues
length(which(fly.qv5 < 0.05)) ## how many SNPs have an FDR < 5%?
## [1] 4
fly.qv8 <- qvalue(fly.pv8$calibrated.pvalue)$qvalues
length(which(fly.qv5 < 0.05)) ## how many SNPs have an FDR < 5%?
## [1] 4
## using K from 2 to 9, the default GIF correction, and an FDR threshold of 0.05, I detected 2 candidate SNPs under selection in response to the PC1 environmental predictor

(fly.FDR.5 <- colnames(snppca)[which(fly.qv5 < 0.05)]) ## identify which SNPs these are
## [1] "163005.A" "163005.G" "163496.G" "163496.A"
## visualizing results


## Manhattan plot with causal loci
 
qvalues <- fly.qv5

row <- as.data.frame(qvalues)
str(row)
## 'data.frame':    3100 obs. of  1 variable:
##  $ PC1: num  0.959 0.959 0.846 0.846 0.997 ...
rownames(row) <- 1:3100

(causal.set <- as.numeric(rownames(row)[which(row < 0.05)]))
## [1] 1251 1252 2161 2162
(fly.FDR.5 <- gsub("[^0-9]", "", fly.FDR.5))
## [1] "163005" "163005" "163496" "163496"
quartz()
plot(-log10(qvalues), pch = 19, cex = 1, bg = "black", col = alpha("gray", 0.3), xlab = "SNP", las = 1)
points(causal.set, -log10(qvalues)[causal.set], pch = 19, col = alpha("red", 0.3))
text(causal.set, (-log10(qvalues)[causal.set] - 0.05), fly.FDR.5, col = "red")

quartz()
layout(matrix(c(0, 1, 1, 0,
                0, 1, 1, 0,
                0, 2, 2, 0,
                0, 2, 2, 0), nrow = 4, ncol = 4, byrow = TRUE))

par(las = 1, mar = c(4, 4, 1, 0.6))
hist(fly.pv5$calibrated.pvalue[ , 1], xlab = "GIF-adjusted p-values", las = 1, main = "", ylim = c(0, 350))
legend("top", paste("K = 6\nGIF = ", round(fly.pv5$gif, 2)), bty = "n")
mtext("(a)", side = 2, at = 370, line = 2.5, las = 1, font = 2, family = "serif")


plot(-log10(qvalues), pch = 19, cex = 1, bg = "gray", col = alpha("black", 0.1), xlab = "SNP", las = 1)
points(causal.set, -log10(qvalues)[causal.set], pch = 19, col = alpha("red", 0.3))
text(causal.set, (-log10(qvalues)[causal.set] - 0.05), fly.FDR.5, col = "red")
mtext("(b)", side = 2, at = 1.45, line = 2.5, las = 1, font = 2, family = "serif")

Figure 6. Genotype-environment association test based on a latent factor mixed model with ridge penalty. (a) Distribution of adjusted p-values using the default genomic inflation factor. (b) Manhattan plot of loci (SNPs) potentially affected by the PC1 predictor variable. Loci highlighted in red were considered to be under selection according to a False Discovery Rate of 0.05.